Implement per-service disk usage monitoring

Replaced system-wide disk usage with accurate per-service tracking by scanning
service-specific directories. Services like sshd now correctly show minimal
disk usage instead of misleading system totals.

- Rename storage widget and add drive capacity/usage columns
- Move host display to main dashboard title for cleaner layout
- Replace separate alert displays with color-coded row highlighting
- Add per-service disk usage collection using du command
- Update services widget formatting to handle small disk values
- Restructure into workspace with dedicated agent and dashboard packages
Christoffer Martinsson 2025-10-11 22:59:16 +02:00
parent 82afe3d4f1
commit 2581435b10
30 changed files with 4801 additions and 446 deletions


@@ -14,10 +14,11 @@ A high-performance Rust-based TUI dashboard for monitoring CMTEC infrastructure.
### Key Features
- **NVMe health monitoring** with wear prediction
- **RAM optimization tracking** (tmpfs, zram, kernel metrics)
- **CPU / memory / GPU telemetry** with automatic thresholding
- **Service resource monitoring** with per-service CPU and RAM usage
- **Disk usage overview** for root filesystems
- **Backup status** with detailed metrics and history
- **Email notification integration**
- **Unified alert pipeline** summarising host health
- **Historical data tracking** and trend analysis
## Technical Architecture
@@ -93,8 +94,10 @@ cm-dashboard/
2. **Service Metrics API** (port 6128)
- Service status and resource usage
- Disk usage per service
- Service memory consumption vs limits
- Host CPU load / frequency / temperature
- Root disk utilisation snapshot
- GPU utilisation and temperature (if available)
3. **Backup Metrics API** (port 6129)
- Backup status and history
@@ -119,6 +122,26 @@ pub struct ServiceMetrics {
pub timestamp: u64,
}
#[derive(Deserialize, Debug)]
pub struct ServiceSummary {
pub healthy: usize,
pub degraded: usize,
pub failed: usize,
pub memory_used_mb: f32,
pub memory_quota_mb: f32,
pub system_memory_used_mb: f32,
pub system_memory_total_mb: f32,
pub disk_used_gb: f32,
pub disk_total_gb: f32,
pub cpu_load_1: f32,
pub cpu_load_5: f32,
pub cpu_load_15: f32,
pub cpu_freq_mhz: Option<f32>,
pub cpu_temp_c: Option<f32>,
pub gpu_load_percent: Option<f32>,
pub gpu_temp_c: Option<f32>,
}
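// A hedged consumer sketch (illustrative, not part of this commit): the
// service collector nests this struct under a "summary" key in its JSON
// payload, so a client can peel it out with serde_json.
fn parse_summary(payload: &serde_json::Value) -> Option<ServiceSummary> {
    serde_json::from_value(payload["summary"].clone()).ok()
}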
#[derive(Deserialize, Debug)]
pub struct BackupMetrics {
pub overall_status: String,
@@ -617,4 +640,4 @@ smartmontools-rs = "0.1" # Or direct smartctl bindings
**Performance Targets**:
- **Agent footprint**: < 2MB RAM, < 1% CPU
- **Metric latency**: < 100ms propagation across network
- **Network efficiency**: < 1KB/s per host steady state

Cargo.lock

@@ -98,6 +98,17 @@ version = "1.0.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61"
[[package]]
name = "async-trait"
version = "0.1.89"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "autocfg"
version = "1.5.0"
@@ -240,6 +251,7 @@ dependencies = [
"clap",
"cm-dashboard-shared",
"crossterm",
"gethostname",
"ratatui",
"serde",
"serde_json",
@@ -256,13 +268,18 @@ name = "cm-dashboard-agent"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"chrono",
"clap",
"cm-dashboard-shared",
"futures",
"gethostname",
"rand",
"serde",
"serde_json",
"thiserror",
"tokio",
"toml",
"tracing",
"tracing-appender",
"tracing-subscriber",
@@ -415,6 +432,105 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "futures"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
name = "futures-core"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "futures-executor"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
[[package]]
name = "futures-macro"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7"
[[package]]
name = "futures-task"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
[[package]]
name = "futures-util"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"slab",
]
[[package]]
name = "gethostname"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0176e0459c2e4a1fe232f984bca6890e681076abb9934f6cea7c326f3fc47818"
dependencies = [
"libc",
"windows-targets 0.48.5",
]
[[package]]
name = "getrandom"
version = "0.2.16"
@@ -738,6 +854,12 @@ version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
[[package]]
name = "pin-utils"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"
[[package]]
name = "pkg-config"
version = "0.3.32"


@@ -6,20 +6,22 @@ CM Dashboard is a Rust-powered terminal UI for real-time monitoring of CMTEC inf
┌──────────────────────────────────────────────────────────────────────────────┐
│                                 CM Dashboard                                 │
├────────────────────────────┬────────────────────────────┬────────────────────┤
│ NVMe Health                │ Services                   │ CPU / Memory       │
│ Host: srv01                │ Host: srv01                │ Host: srv01        │
│ Status: Healthy            │ Service memory: 1.2G/4.0G  │ RAM: 6.9 / 7.8 GiB │
│ Healthy/Warning/Critical:  │ Disk usage: 45 / 500 GiB   │ CPU load (1/5/15): │
│ 4 / 0 / 0                  │ Services tracked: 8        │ 1.2 0.9 0.7        │
│ Capacity used: 512 / 2048G │                            │ CPU temp: 68°C     │
│ Issue: —                   │ nginx running 320M         │ GPU temp: —        │
│                            │ immich running 1.2G        │ Status • ok        │
│                            │ backup-api running 40M     │                    │
├────────────────────────────┴────────────┬───────────────┴────────────────────┤
│ Backups                                 │ Alerts                             │
│ Host: srv01                             │ srv01: ok                          │
│ Overall: Healthy                        │ labbox: warning: RAM 82%           │
│ Last success: 2024-02-01 03:12:45       │ cmbox: critical: CPU temp 92°C     │
│ Snapshots: 17 • Size: 512.0 GiB         │ Update: 2024-02-01 10:15:32        │
│ Pending jobs: 0 (enabled: true)         │                                    │
└──────────────────────────────┬───────────────────────────────────────────────┘
│ Status                       │                                               │
│ Active host: srv01 (1/3)     │ History retention ≈ 3600s                     │
@@ -99,7 +101,8 @@ Adjust the host list and `data_source.zmq.endpoints` to match your CMTEC gossip
## Features
- Rotating host selection with left/right arrows (`←`, `→`, `h`, `l`, `Tab`)
- Live NVMe, service, CPU/memory, backup, and alert panels per host
- Health scoring that rolls CPU/RAM/GPU pressure into alerts automatically
- Structured logging with `tracing` (`-v`/`-vv` to increase verbosity)
- Help overlay (`?`) outlining keyboard shortcuts
- Config-driven host discovery via `config/dashboard.toml`

agent/Cargo.toml

@@ -6,13 +6,18 @@ edition = "2021"
[dependencies]
cm-dashboard-shared = { path = "../shared" }
anyhow = "1.0"
async-trait = "0.1"
clap = { version = "4.0", features = ["derive"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
chrono = { version = "0.4", features = ["serde"] }
thiserror = "1.0"
toml = "0.8"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
tracing-appender = "0.2"
zmq = "0.10"
tokio = { version = "1.0", features = ["full", "process"] }
futures = "0.3"
rand = "0.8"
gethostname = "0.4"

agent/src/agent.rs

@@ -0,0 +1,263 @@
use cm_dashboard_shared::envelope::MessageEnvelope;
use std::sync::Arc;
use tokio::sync::mpsc;
use tokio::time::{interval, Duration};
use tracing::{debug, error, info, warn};
use zmq::{Context, SocketType};
use crate::collectors::{
backup::BackupCollector, service::ServiceCollector, smart::SmartCollector, AgentType,
CollectorOutput,
};
use crate::config::AgentConfig;
use crate::scheduler::{CollectorScheduler, HealthChecker, HealthStatus};
pub struct MetricsAgent {
config: AgentConfig,
scheduler: CollectorScheduler,
health_checker: Option<HealthChecker>,
}
impl MetricsAgent {
pub fn from_config(config: AgentConfig) -> Result<Self, Box<dyn std::error::Error>> {
let mut agent = Self::new(config)?;
agent.initialize_collectors()?;
Ok(agent)
}
pub fn new(config: AgentConfig) -> Result<Self, Box<dyn std::error::Error>> {
Ok(Self {
config,
scheduler: CollectorScheduler::new(),
health_checker: None,
})
}
pub fn initialize_collectors(&mut self) -> Result<(), Box<dyn std::error::Error>> {
info!("Initializing collectors...");
// Create SMART collector
if self.config.collectors.smart.enabled {
let smart_collector = SmartCollector::new(
self.config.collectors.smart.enabled,
self.config.collectors.smart.interval_ms,
self.config.collectors.smart.devices.clone(),
);
self.scheduler.add_collector(Arc::new(smart_collector));
info!("SMART collector initialized");
}
// Create Service collector
if self.config.collectors.service.enabled {
let service_collector = ServiceCollector::new(
self.config.collectors.service.enabled,
self.config.collectors.service.interval_ms,
self.config.collectors.service.services.clone(),
);
self.scheduler.add_collector(Arc::new(service_collector));
info!("Service collector initialized");
}
// Create Backup collector
if self.config.collectors.backup.enabled {
let backup_collector = BackupCollector::new(
self.config.collectors.backup.enabled,
self.config.collectors.backup.interval_ms,
self.config.collectors.backup.restic_repo.clone(),
self.config.collectors.backup.backup_service.clone(),
);
self.scheduler.add_collector(Arc::new(backup_collector));
info!("Backup collector initialized");
}
let enabled_count = self.config.get_enabled_collector_count();
if enabled_count == 0 {
return Err("No collectors are enabled".into());
}
info!("Initialized {} collectors", enabled_count);
Ok(())
}
pub async fn run(&mut self) -> Result<(), Box<dyn std::error::Error>> {
info!(
"Starting metrics agent for host '{}'",
self.config.agent.hostname
);
// Initialize health checker
let stats = self.scheduler.get_stats_handle();
self.health_checker = Some(HealthChecker::new(stats));
// Forward successful collection results to the publisher
let (metrics_tx, metrics_rx) = mpsc::unbounded_channel();
self.scheduler.set_metrics_sender(metrics_tx);
let publisher_task = self.start_publisher_task(metrics_rx)?;
// Start health monitoring task
let health_task = self.start_health_monitoring_task().await?;
// Start the collector scheduler (this will block)
let scheduler_result = self.scheduler.start().await;
// Drop the metrics sender so the publisher can exit cleanly
self.scheduler.clear_metrics_sender();
// Wait for background tasks to complete
if let Err(join_error) = health_task.await {
warn!("Health monitoring task ended unexpectedly: {}", join_error);
}
if let Err(join_error) = publisher_task.await {
warn!("Publisher task ended unexpectedly: {}", join_error);
}
match scheduler_result {
Ok(_) => {
info!("Agent shutdown completed successfully");
Ok(())
}
Err(e) => {
error!("Agent encountered an error: {}", e);
Err(e.into())
}
}
}
fn start_publisher_task(
&self,
mut metrics_rx: mpsc::UnboundedReceiver<CollectorOutput>,
) -> Result<tokio::task::JoinHandle<()>, Box<dyn std::error::Error>> {
let bind_address = format!(
"tcp://{}:{}",
self.config.zmq.bind_address, self.config.zmq.port
);
let send_timeout = self.config.zmq.send_timeout_ms as i32;
let hostname = self.config.agent.hostname.clone();
let handle = tokio::spawn(async move {
let context = Context::new();
let socket = match context.socket(SocketType::PUB) {
Ok(socket) => socket,
Err(error) => {
error!("Failed to create ZMQ PUB socket: {}", error);
return;
}
};
if let Err(error) = socket.set_sndtimeo(send_timeout) {
warn!("Failed to apply ZMQ send timeout: {}", error);
}
if let Err(error) = socket.bind(&bind_address) {
error!(
"Failed to bind ZMQ publisher to {}: {}",
bind_address, error
);
return;
}
info!("ZMQ publisher bound to {}", bind_address);
while let Some(output) = metrics_rx.recv().await {
let CollectorOutput {
agent_type,
data,
timestamp,
} = output;
let envelope_agent_type = match agent_type {
AgentType::Smart => cm_dashboard_shared::envelope::AgentType::Smart,
AgentType::Service => cm_dashboard_shared::envelope::AgentType::Service,
AgentType::Backup => cm_dashboard_shared::envelope::AgentType::Backup,
};
let epoch = timestamp.timestamp();
let epoch_u64 = if epoch < 0 { 0 } else { epoch as u64 };
let envelope = MessageEnvelope {
hostname: hostname.clone(),
agent_type: envelope_agent_type.clone(),
timestamp: epoch_u64,
metrics: data,
};
match serde_json::to_vec(&envelope) {
Ok(serialized) => {
if let Err(error) = socket.send(serialized, 0) {
warn!(
"Failed to publish {:?} metrics: {}",
envelope.agent_type, error
);
} else {
debug!(
"Published {:?} metrics for host {}",
envelope.agent_type, envelope.hostname
);
}
}
Err(error) => {
warn!("Failed to serialize metrics envelope: {}", error);
}
}
}
info!("Metrics publisher task shutting down");
});
Ok(handle)
}
async fn start_health_monitoring_task(
&self,
) -> Result<tokio::task::JoinHandle<()>, Box<dyn std::error::Error>> {
let health_checker = self.health_checker.as_ref().unwrap().clone();
let task = tokio::spawn(async move {
info!("Starting health monitoring task");
let mut health_interval = interval(Duration::from_secs(60)); // Check every minute
loop {
health_interval.tick().await;
match health_checker.check_health().await {
HealthStatus::Healthy => {
debug!("All collectors are healthy");
}
HealthStatus::Degraded {
degraded_collectors,
} => {
warn!("Degraded collectors: {:?}", degraded_collectors);
}
HealthStatus::Unhealthy {
unhealthy_collectors,
degraded_collectors,
} => {
error!(
"Unhealthy collectors: {:?}, Degraded: {:?}",
unhealthy_collectors, degraded_collectors
);
}
}
}
});
Ok(task)
}
pub async fn shutdown(&self) {
info!("Initiating graceful shutdown...");
self.scheduler.shutdown().await;
// ZMQ socket will be dropped automatically
info!("Agent shutdown completed");
}
}
impl Drop for MetricsAgent {
fn drop(&mut self) {
// ZMQ socket will be dropped automatically
}
}
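For orientation, here is a minimal subscriber-side counterpart to the publisher task above. This is a hypothetical consumer sketch, not part of this commit; the endpoint and port mirror the agent defaults (`tcp://<host>:6130`).

use zmq::{Context, SocketType};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let context = Context::new();
    let socket = context.socket(SocketType::SUB)?;
    // Connect to the agent's PUB endpoint and subscribe to everything.
    socket.connect("tcp://srv01:6130")?;
    socket.set_subscribe(b"")?;
    loop {
        let bytes = socket.recv_bytes(0)?;
        // Envelopes are plain JSON: hostname, agent_type, timestamp, metrics.
        let envelope: serde_json::Value = serde_json::from_slice(&bytes)?;
        println!("{} -> {}", envelope["hostname"], envelope["agent_type"]);
    }
}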

agent/src/collectors/backup.rs

@@ -0,0 +1,388 @@
use async_trait::async_trait;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use serde_json::json;
use std::process::Stdio;
use std::time::Duration;
use tokio::process::Command;
use tokio::time::timeout;
use super::{AgentType, Collector, CollectorError, CollectorOutput};
#[derive(Debug, Clone)]
pub struct BackupCollector {
pub enabled: bool,
pub interval: Duration,
pub restic_repo: Option<String>,
pub backup_service: String,
pub timeout_ms: u64,
}
impl BackupCollector {
pub fn new(
enabled: bool,
interval_ms: u64,
restic_repo: Option<String>,
backup_service: String,
) -> Self {
Self {
enabled,
interval: Duration::from_millis(interval_ms),
restic_repo,
backup_service,
timeout_ms: 30000, // 30 second timeout for backup operations
}
}
async fn get_restic_snapshots(&self) -> Result<ResticStats, CollectorError> {
let repo = self
.restic_repo
.as_ref()
.ok_or_else(|| CollectorError::ConfigError {
message: "No restic repository configured".to_string(),
})?;
let timeout_duration = Duration::from_millis(self.timeout_ms);
// Get restic snapshots
let output = timeout(
timeout_duration,
Command::new("restic")
.args(["-r", repo, "snapshots", "--json"])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output(),
)
.await
.map_err(|_| CollectorError::Timeout {
duration_ms: self.timeout_ms,
})?
.map_err(|e| CollectorError::CommandFailed {
command: format!("restic -r {} snapshots --json", repo),
message: e.to_string(),
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(CollectorError::CommandFailed {
command: format!("restic -r {} snapshots --json", repo),
message: stderr.to_string(),
});
}
let stdout = String::from_utf8_lossy(&output.stdout);
let snapshots: Vec<ResticSnapshot> =
serde_json::from_str(&stdout).map_err(|e| CollectorError::ParseError {
message: format!("Failed to parse restic snapshots: {}", e),
})?;
// Get repository stats
let stats_output = timeout(
timeout_duration,
Command::new("restic")
.args(["-r", repo, "stats", "--json"])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output(),
)
.await
.map_err(|_| CollectorError::Timeout {
duration_ms: self.timeout_ms,
})?
.map_err(|e| CollectorError::CommandFailed {
command: format!("restic -r {} stats --json", repo),
message: e.to_string(),
})?;
let repo_size_gb = if stats_output.status.success() {
let stats_stdout = String::from_utf8_lossy(&stats_output.stdout);
// `restic stats --json` emits { "total_size": <bytes>, ... }; parse it
// loosely rather than into ResticStats, whose other fields are required
// and absent from the stats output.
serde_json::from_str::<serde_json::Value>(&stats_stdout)
.ok()
.and_then(|v| v["total_size"].as_u64())
.map(|bytes| bytes as f32 / (1024.0 * 1024.0 * 1024.0))
.unwrap_or(0.0)
} else {
0.0
};
// Find most recent snapshot
let last_success = snapshots.iter().map(|s| s.time).max();
Ok(ResticStats {
total_size: (repo_size_gb * 1024.0 * 1024.0 * 1024.0) as u64,
snapshot_count: snapshots.len() as u32,
last_success,
})
}
async fn get_backup_service_status(&self) -> Result<BackupServiceData, CollectorError> {
let timeout_duration = Duration::from_millis(self.timeout_ms);
// Get systemctl status for backup service
let status_output = timeout(
timeout_duration,
Command::new("systemctl")
.args([
"show",
&self.backup_service,
"--property=ActiveState,SubState,MainPID",
])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output(),
)
.await
.map_err(|_| CollectorError::Timeout {
duration_ms: self.timeout_ms,
})?
.map_err(|e| CollectorError::CommandFailed {
command: format!("systemctl show {}", self.backup_service),
message: e.to_string(),
})?;
let enabled = if status_output.status.success() {
let status_stdout = String::from_utf8_lossy(&status_output.stdout);
status_stdout.contains("ActiveState=active")
|| status_stdout.contains("SubState=running")
} else {
false
};
// Check for backup timer or service logs for last message
let last_message = self.get_last_backup_log_message().await.ok();
// Check for pending backup jobs (simplified - could check systemd timers)
let pending_jobs = 0; // TODO: Implement proper pending job detection
Ok(BackupServiceData {
enabled,
pending_jobs,
last_message,
})
}
async fn get_last_backup_log_message(&self) -> Result<String, CollectorError> {
let output = Command::new("journalctl")
.args([
"-u",
&self.backup_service,
"--lines=1",
"--no-pager",
"--output=cat",
])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output()
.await
.map_err(|e| CollectorError::CommandFailed {
command: format!("journalctl -u {} --lines=1", self.backup_service),
message: e.to_string(),
})?;
if output.status.success() {
let stdout = String::from_utf8_lossy(&output.stdout);
let message = stdout.trim().to_string();
if !message.is_empty() {
return Ok(message);
}
}
Err(CollectorError::ParseError {
message: "No log messages found".to_string(),
})
}
async fn get_backup_logs_for_failures(&self) -> Result<Option<DateTime<Utc>>, CollectorError> {
let output = Command::new("journalctl")
.args([
"-u",
&self.backup_service,
"--since",
"1 week ago",
"--grep=failed\\|error\\|ERROR",
"--output=json",
"--lines=1",
])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output()
.await
.map_err(|e| CollectorError::CommandFailed {
command: format!(
"journalctl -u {} --since='1 week ago' --grep=failed",
self.backup_service
),
message: e.to_string(),
})?;
if output.status.success() {
let stdout = String::from_utf8_lossy(&output.stdout);
if let Ok(log_entry) = serde_json::from_str::<JournalEntry>(&stdout) {
if let Ok(timestamp) = log_entry.realtime_timestamp.parse::<i64>() {
let dt =
DateTime::from_timestamp_micros(timestamp).unwrap_or_else(|| Utc::now());
return Ok(Some(dt));
}
}
}
Ok(None)
}
fn determine_backup_status(
&self,
restic_stats: &Result<ResticStats, CollectorError>,
service_data: &BackupServiceData,
last_failure: Option<DateTime<Utc>>,
) -> BackupStatus {
match restic_stats {
Ok(stats) => {
if let Some(last_success) = stats.last_success {
let hours_since_backup =
Utc::now().signed_duration_since(last_success).num_hours();
if hours_since_backup > 48 {
BackupStatus::Warning // More than 2 days since last backup
} else if let Some(failure) = last_failure {
if failure > last_success {
BackupStatus::Failed // Failure after last success
} else {
BackupStatus::Healthy
}
} else {
BackupStatus::Healthy
}
} else {
BackupStatus::Warning // No successful backups found
}
}
Err(_) => {
if service_data.enabled {
BackupStatus::Failed // Service enabled but can't access repo
} else {
BackupStatus::Unknown // Service disabled
}
}
}
}
}
#[async_trait]
impl Collector for BackupCollector {
fn name(&self) -> &str {
"backup"
}
fn agent_type(&self) -> AgentType {
AgentType::Backup
}
fn collect_interval(&self) -> Duration {
self.interval
}
fn is_enabled(&self) -> bool {
self.enabled
}
fn requires_root(&self) -> bool {
false // Depends on restic repo permissions
}
async fn collect(&self) -> Result<CollectorOutput, CollectorError> {
// Get restic repository stats
let restic_stats = self.get_restic_snapshots().await;
// Get backup service status
let service_data = self
.get_backup_service_status()
.await
.unwrap_or(BackupServiceData {
enabled: false,
pending_jobs: 0,
last_message: None,
});
// Check for recent failures
let last_failure = self.get_backup_logs_for_failures().await.unwrap_or(None);
// Determine overall backup status
let overall_status =
self.determine_backup_status(&restic_stats, &service_data, last_failure);
let (backup_info, _size_gb) = match &restic_stats {
Ok(stats) => (
BackupInfo {
last_success: stats.last_success,
last_failure,
size_gb: stats.total_size as f32 / (1024.0 * 1024.0 * 1024.0),
snapshot_count: stats.snapshot_count,
},
stats.total_size as f32 / (1024.0 * 1024.0 * 1024.0),
),
Err(_) => (
BackupInfo {
last_success: None,
last_failure,
size_gb: 0.0,
snapshot_count: 0,
},
0.0,
),
};
let backup_metrics = json!({
"overall_status": overall_status,
"backup": backup_info,
"service": service_data,
"timestamp": Utc::now()
});
Ok(CollectorOutput {
agent_type: AgentType::Backup,
data: backup_metrics,
timestamp: Utc::now(),
})
}
}
#[derive(Debug, Deserialize)]
struct ResticSnapshot {
time: DateTime<Utc>,
}
#[derive(Debug, Deserialize)]
struct ResticStats {
total_size: u64,
snapshot_count: u32,
last_success: Option<DateTime<Utc>>,
}
#[derive(Debug, Serialize)]
struct BackupServiceData {
enabled: bool,
pending_jobs: u32,
last_message: Option<String>,
}
#[derive(Debug, Serialize)]
struct BackupInfo {
last_success: Option<DateTime<Utc>>,
last_failure: Option<DateTime<Utc>>,
size_gb: f32,
snapshot_count: u32,
}
#[derive(Debug, Serialize)]
enum BackupStatus {
Healthy,
Warning,
Failed,
Unknown,
}
#[derive(Debug, Deserialize)]
struct JournalEntry {
#[serde(rename = "__REALTIME_TIMESTAMP")]
realtime_timestamp: String,
}
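A quick check of the status matrix in determine_backup_status; a test sketch using the code's own thresholds (stale means a last success older than 48 hours):

#[cfg(test)]
mod tests {
    use super::*;

    fn collector() -> BackupCollector {
        BackupCollector::new(true, 30000, Some("/tmp/repo".into()), "restic-backup".into())
    }

    fn service(enabled: bool) -> BackupServiceData {
        BackupServiceData { enabled, pending_jobs: 0, last_message: None }
    }

    #[test]
    fn stale_snapshot_is_warning() {
        let stats: Result<ResticStats, CollectorError> = Ok(ResticStats {
            total_size: 0,
            snapshot_count: 1,
            last_success: Some(Utc::now() - chrono::Duration::hours(72)),
        });
        let status = collector().determine_backup_status(&stats, &service(true), None);
        assert!(matches!(status, BackupStatus::Warning));
    }

    #[test]
    fn fresh_snapshot_is_healthy() {
        let stats: Result<ResticStats, CollectorError> = Ok(ResticStats {
            total_size: 0,
            snapshot_count: 1,
            last_success: Some(Utc::now()),
        });
        let status = collector().determine_backup_status(&stats, &service(true), None);
        assert!(matches!(status, BackupStatus::Healthy));
    }
}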

agent/src/collectors/error.rs

@@ -0,0 +1,53 @@
use thiserror::Error;
#[derive(Debug, Error)]
pub enum CollectorError {
#[error("Command execution failed: {command} - {message}")]
CommandFailed { command: String, message: String },
#[error("Permission denied: {message}")]
PermissionDenied { message: String },
#[error("Data parsing error: {message}")]
ParseError { message: String },
#[error("Timeout after {duration_ms}ms")]
Timeout { duration_ms: u64 },
#[error("IO error: {message}")]
IoError { message: String },
#[error("Configuration error: {message}")]
ConfigError { message: String },
#[error("Service not found: {service}")]
ServiceNotFound { service: String },
#[error("Device not found: {device}")]
DeviceNotFound { device: String },
#[error("External dependency error: {dependency} - {message}")]
ExternalDependency { dependency: String, message: String },
}
impl From<std::io::Error> for CollectorError {
fn from(err: std::io::Error) -> Self {
CollectorError::IoError {
message: err.to_string(),
}
}
}
impl From<serde_json::Error> for CollectorError {
fn from(err: serde_json::Error) -> Self {
CollectorError::ParseError {
message: err.to_string(),
}
}
}
impl From<tokio::time::error::Elapsed> for CollectorError {
fn from(_: tokio::time::error::Elapsed) -> Self {
CollectorError::Timeout { duration_ms: 0 }
}
}
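Because of these From impls, collector code can bubble raw errors up with `?`; a tiny illustration (hypothetical helper, not part of this commit):

// std::io::Error converts into CollectorError::IoError automatically via `?`.
async fn read_loadavg() -> Result<String, CollectorError> {
    let content = tokio::fs::read_to_string("/proc/loadavg").await?;
    Ok(content.trim().to_string())
}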

agent/src/collectors/mod.rs

@@ -0,0 +1,49 @@
use async_trait::async_trait;
use chrono::{DateTime, Utc};
use serde_json::Value;
use std::time::Duration;
pub mod backup;
pub mod error;
pub mod service;
pub mod smart;
pub use error::CollectorError;
#[derive(Debug, Clone)]
pub enum AgentType {
Smart,
Service,
Backup,
}
impl AgentType {
pub fn as_str(&self) -> &'static str {
match self {
AgentType::Smart => "smart",
AgentType::Service => "service",
AgentType::Backup => "backup",
}
}
}
#[derive(Debug, Clone)]
pub struct CollectorOutput {
pub agent_type: AgentType,
pub data: Value,
pub timestamp: DateTime<Utc>,
}
#[async_trait]
pub trait Collector: Send + Sync {
fn name(&self) -> &str;
fn agent_type(&self) -> AgentType;
fn collect_interval(&self) -> Duration;
async fn collect(&self) -> Result<CollectorOutput, CollectorError>;
fn is_enabled(&self) -> bool {
true
}
fn requires_root(&self) -> bool {
false
}
}
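Any new metric source only has to implement this trait and be registered with the scheduler; a minimal sketch (hypothetical example, reusing AgentType::Service since the enum has no dedicated variant):

use serde_json::json;

struct UptimeCollector;

#[async_trait]
impl Collector for UptimeCollector {
    fn name(&self) -> &str {
        "uptime"
    }
    fn agent_type(&self) -> AgentType {
        AgentType::Service
    }
    fn collect_interval(&self) -> Duration {
        Duration::from_secs(60)
    }
    async fn collect(&self) -> Result<CollectorOutput, CollectorError> {
        // First field of /proc/uptime is seconds since boot.
        let raw = tokio::fs::read_to_string("/proc/uptime").await?;
        let seconds: f64 = raw
            .split_whitespace()
            .next()
            .and_then(|s| s.parse().ok())
            .unwrap_or(0.0);
        Ok(CollectorOutput {
            agent_type: self.agent_type(),
            data: json!({ "uptime_seconds": seconds }),
            timestamp: Utc::now(),
        })
    }
}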

agent/src/collectors/service.rs

@@ -0,0 +1,603 @@
use async_trait::async_trait;
use chrono::Utc;
use serde::Serialize;
use serde_json::json;
use std::collections::HashMap;
use std::process::Stdio;
use std::time::Duration;
use tokio::fs;
use tokio::process::Command;
use tokio::time::timeout;
use super::{AgentType, Collector, CollectorError, CollectorOutput};
#[derive(Debug, Clone)]
pub struct ServiceCollector {
pub enabled: bool,
pub interval: Duration,
pub services: Vec<String>,
pub timeout_ms: u64,
}
impl ServiceCollector {
pub fn new(enabled: bool, interval_ms: u64, services: Vec<String>) -> Self {
Self {
enabled,
interval: Duration::from_millis(interval_ms),
services,
timeout_ms: 10000, // 10 second timeout for service checks
}
}
async fn get_service_status(&self, service: &str) -> Result<ServiceData, CollectorError> {
let timeout_duration = Duration::from_millis(self.timeout_ms);
// Get systemctl status
let status_output = timeout(
timeout_duration,
Command::new("systemctl")
.args(["show", service, "--property=ActiveState,SubState,MainPID"])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output(),
)
.await
.map_err(|_| CollectorError::Timeout {
duration_ms: self.timeout_ms,
})?
.map_err(|e| CollectorError::CommandFailed {
command: format!("systemctl show {}", service),
message: e.to_string(),
})?;
if !status_output.status.success() {
return Err(CollectorError::ServiceNotFound {
service: service.to_string(),
});
}
let status_stdout = String::from_utf8_lossy(&status_output.stdout);
let mut active_state = None;
let mut sub_state = None;
let mut main_pid = None;
for line in status_stdout.lines() {
if let Some(value) = line.strip_prefix("ActiveState=") {
active_state = Some(value.to_string());
} else if let Some(value) = line.strip_prefix("SubState=") {
sub_state = Some(value.to_string());
} else if let Some(value) = line.strip_prefix("MainPID=") {
main_pid = value.parse::<u32>().ok();
}
}
let status = self.determine_service_status(&active_state, &sub_state);
// Get resource usage if service is running
let (memory_used_mb, cpu_percent) = if let Some(pid) = main_pid {
self.get_process_resources(pid).await.unwrap_or((0.0, 0.0))
} else {
(0.0, 0.0)
};
// Get memory quota from systemd if available
let memory_quota_mb = self.get_service_memory_limit(service).await.unwrap_or(0.0);
// Get disk usage for this service
let disk_used_gb = self.get_service_disk_usage(service).await.unwrap_or(0.0);
Ok(ServiceData {
name: service.to_string(),
status,
memory_used_mb,
memory_quota_mb,
cpu_percent,
sandbox_limit: None, // TODO: Implement sandbox limit detection
disk_used_gb,
})
}
fn determine_service_status(
&self,
active_state: &Option<String>,
sub_state: &Option<String>,
) -> ServiceStatus {
match (active_state.as_deref(), sub_state.as_deref()) {
(Some("active"), Some("running")) => ServiceStatus::Running,
(Some("active"), Some("exited")) => ServiceStatus::Running, // One-shot services
(Some("reloading"), _) | (Some("activating"), _) => ServiceStatus::Restarting,
(Some("failed"), _) | (Some("inactive"), Some("failed")) => ServiceStatus::Stopped,
(Some("inactive"), _) => ServiceStatus::Stopped,
_ => ServiceStatus::Degraded,
}
}
async fn get_process_resources(&self, pid: u32) -> Result<(f32, f32), CollectorError> {
// Read /proc/{pid}/stat for CPU and memory info
let stat_path = format!("/proc/{}/stat", pid);
let stat_content =
fs::read_to_string(&stat_path)
.await
.map_err(|e| CollectorError::IoError {
message: e.to_string(),
})?;
let stat_fields: Vec<&str> = stat_content.split_whitespace().collect();
if stat_fields.len() < 24 {
return Err(CollectorError::ParseError {
message: format!("Invalid /proc/{}/stat format", pid),
});
}
// Field 24 of /proc/<pid>/stat (index 23 after splitting) is RSS in pages.
// Caveat: a plain whitespace split miscounts if the comm field contains spaces.
let rss_pages: u64 = stat_fields[23]
.parse()
.map_err(|e| CollectorError::ParseError {
message: format!("Failed to parse RSS from /proc/{}/stat: {}", pid, e),
})?;
// Convert pages to MB (assuming 4KB pages)
let memory_mb = (rss_pages * 4) as f32 / 1024.0;
// For CPU, we'd need to track over time - simplified to 0 for now
// TODO: Implement proper CPU percentage calculation
let cpu_percent = 0.0;
Ok((memory_mb, cpu_percent))
}
async fn get_service_disk_usage(&self, service: &str) -> Result<f32, CollectorError> {
// For systemd services, check if they have private /var directories or specific data paths
// This is a simplified implementation - could be enhanced to check actual service-specific paths
// Common service data directories to check
let potential_paths = vec![
format!("/var/lib/{}", service),
format!("/var/cache/{}", service),
format!("/var/log/{}", service),
format!("/opt/{}", service),
format!("/srv/{}", service),
];
let mut total_usage = 0.0;
for path in potential_paths {
if let Ok(usage) = self.get_directory_size(&path).await {
total_usage += usage;
}
}
Ok(total_usage)
}
async fn get_directory_size(&self, path: &str) -> Result<f32, CollectorError> {
let output = Command::new("du")
.args(["-s", "-k", path]) // Use kilobytes instead of forcing GB
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output()
.await
.map_err(|e| CollectorError::CommandFailed {
command: format!("du -s -k {}", path),
message: e.to_string(),
})?;
if !output.status.success() {
// Directory doesn't exist or permission denied - return 0
return Ok(0.0);
}
let stdout = String::from_utf8_lossy(&output.stdout);
if let Some(line) = stdout.lines().next() {
if let Some(size_str) = line.split_whitespace().next() {
let size_kb = size_str.parse::<f32>().unwrap_or(0.0);
let size_gb = size_kb / (1024.0 * 1024.0); // Convert KB to GB
return Ok(size_gb);
}
}
Ok(0.0)
}
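// Worked example of the KB→GB conversion above: `du -sk /var/lib/gitea`
// reporting 47185920 KB yields 47185920 / 1048576 = 45.0 GB, while a tiny
// unit like sshd with ~40 KB of state comes out near 0.00004 GB, which is
// why the services widget formatting was updated to render small values.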
async fn get_service_memory_limit(&self, service: &str) -> Result<f32, CollectorError> {
let output = Command::new("systemctl")
.args(["show", service, "--property=MemoryMax"])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output()
.await
.map_err(|e| CollectorError::CommandFailed {
command: format!("systemctl show {} --property=MemoryMax", service),
message: e.to_string(),
})?;
let stdout = String::from_utf8_lossy(&output.stdout);
for line in stdout.lines() {
if let Some(value) = line.strip_prefix("MemoryMax=") {
if value == "infinity" {
return Ok(0.0); // No limit
}
if let Ok(bytes) = value.parse::<u64>() {
return Ok(bytes as f32 / (1024.0 * 1024.0)); // Convert to MB
}
}
}
Ok(0.0) // No limit or couldn't parse
}
async fn get_system_memory_info(&self) -> Result<SystemMemoryInfo, CollectorError> {
let meminfo =
fs::read_to_string("/proc/meminfo")
.await
.map_err(|e| CollectorError::IoError {
message: e.to_string(),
})?;
let mut memory_info = HashMap::new();
for line in meminfo.lines() {
if let Some((key, value)) = line.split_once(':') {
let value = value.trim().trim_end_matches(" kB");
if let Ok(kb) = value.parse::<u64>() {
memory_info.insert(key.to_string(), kb);
}
}
}
let total_kb = memory_info.get("MemTotal").copied().unwrap_or(0);
let available_kb = memory_info.get("MemAvailable").copied().unwrap_or(0);
let used_kb = total_kb.saturating_sub(available_kb);
Ok(SystemMemoryInfo {
total_mb: total_kb as f32 / 1024.0,
used_mb: used_kb as f32 / 1024.0,
})
}
async fn get_disk_usage(&self) -> Result<DiskUsage, CollectorError> {
let output = Command::new("df")
.args(["-BG", "--output=size,used,avail", "/"])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output()
.await
.map_err(|e| CollectorError::CommandFailed {
command: "df -BG --output=size,used,avail /".to_string(),
message: e.to_string(),
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(CollectorError::CommandFailed {
command: "df -BG --output=size,used,avail /".to_string(),
message: stderr.to_string(),
});
}
let stdout = String::from_utf8_lossy(&output.stdout);
let lines: Vec<&str> = stdout.lines().collect();
if lines.len() < 2 {
return Err(CollectorError::ParseError {
message: "Unexpected df output format".to_string(),
});
}
let data_line = lines[1].trim();
let parts: Vec<&str> = data_line.split_whitespace().collect();
if parts.len() < 3 {
return Err(CollectorError::ParseError {
message: format!("Unexpected df data format: {}", data_line),
});
}
let parse_size = |s: &str| -> Result<f32, CollectorError> {
s.trim_end_matches('G')
.parse::<f32>()
.map_err(|e| CollectorError::ParseError {
message: format!("Failed to parse disk size '{}': {}", s, e),
})
};
Ok(DiskUsage {
total_gb: parse_size(parts[0])?,
used_gb: parse_size(parts[1])?,
})
}
async fn get_cpu_load(&self) -> Result<(f32, f32, f32), CollectorError> {
let loadavg =
fs::read_to_string("/proc/loadavg")
.await
.map_err(|e| CollectorError::IoError {
message: e.to_string(),
})?;
let parts: Vec<&str> = loadavg.split_whitespace().collect();
if parts.len() < 3 {
return Err(CollectorError::ParseError {
message: "Unexpected /proc/loadavg format".to_string(),
});
}
let parse = |s: &str| -> Result<f32, CollectorError> {
s.parse::<f32>().map_err(|e| CollectorError::ParseError {
message: format!("Failed to parse load average '{}': {}", s, e),
})
};
Ok((parse(parts[0])?, parse(parts[1])?, parse(parts[2])?))
}
async fn get_cpu_frequency_mhz(&self) -> Option<f32> {
let candidates = [
"/sys/devices/system/cpu/cpufreq/policy0/scaling_cur_freq",
"/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq",
];
for path in candidates {
if let Ok(content) = fs::read_to_string(path).await {
if let Ok(khz) = content.trim().parse::<f32>() {
if khz > 0.0 {
return Some(khz / 1000.0);
}
}
}
}
if let Ok(content) = fs::read_to_string("/proc/cpuinfo").await {
for line in content.lines() {
if let Some(rest) = line.strip_prefix("cpu MHz") {
if let Some(value) = rest.split(':').nth(1) {
if let Ok(mhz) = value.trim().parse::<f32>() {
if mhz > 0.0 {
return Some(mhz);
}
}
}
}
}
}
None
}
async fn get_cpu_temperature_c(&self) -> Option<f32> {
let mut entries = fs::read_dir("/sys/class/thermal").await.ok()?;
let mut fallback: Option<f32> = None;
while let Ok(Some(entry)) = entries.next_entry().await {
let path = entry.path();
let type_path = path.join("type");
let temp_path = path.join("temp");
let label = match fs::read_to_string(&type_path).await {
Ok(value) => value.to_lowercase(),
// Skip zones without a readable type instead of aborting the whole scan
Err(_) => continue,
};
let raw = match fs::read_to_string(&temp_path).await {
Ok(value) => value,
Err(_) => continue,
};
let milli: f32 = match raw.trim().parse() {
Ok(value) => value,
Err(_) => continue,
};
let temp_c = milli / 1000.0;
if label.contains("cpu") || label.contains("pkg") {
if temp_c > 0.0 {
return Some(temp_c);
}
}
if fallback.is_none() && temp_c > 0.0 {
fallback = Some(temp_c);
}
}
fallback
}
async fn get_gpu_metrics(&self) -> (Option<f32>, Option<f32>) {
let output = Command::new("nvidia-smi")
.args([
"--query-gpu=utilization.gpu,temperature.gpu",
"--format=csv,noheader,nounits",
])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output()
.await;
match output {
Ok(result) if result.status.success() => {
let stdout = String::from_utf8_lossy(&result.stdout);
if let Some(line) = stdout.lines().next() {
let parts: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
if parts.len() >= 2 {
let load = parts[0].parse::<f32>().ok();
let temp = parts[1].parse::<f32>().ok();
return (load, temp);
}
}
(None, None)
}
Ok(_) | Err(_) => {
let util_output = Command::new("/opt/vc/bin/vcgencmd")
.arg("measure_temp")
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output()
.await;
if let Ok(result) = util_output {
if result.status.success() {
let stdout = String::from_utf8_lossy(&result.stdout);
if let Some(value) = stdout
.trim()
.strip_prefix("temp=")
.and_then(|s| s.strip_suffix("'C"))
{
if let Ok(temp_c) = value.parse::<f32>() {
return (None, Some(temp_c));
}
}
}
}
(None, None)
}
}
}
}
#[async_trait]
impl Collector for ServiceCollector {
fn name(&self) -> &str {
"service"
}
fn agent_type(&self) -> AgentType {
AgentType::Service
}
fn collect_interval(&self) -> Duration {
self.interval
}
fn is_enabled(&self) -> bool {
self.enabled
}
fn requires_root(&self) -> bool {
false // Most systemctl commands work without root
}
async fn collect(&self) -> Result<CollectorOutput, CollectorError> {
let mut services = Vec::new();
let mut healthy = 0;
let mut degraded = 0;
let mut failed = 0;
let mut total_memory_used = 0.0;
let mut total_memory_quota = 0.0;
let mut total_disk_used = 0.0;
// Collect data from all configured services
for service in &self.services {
match self.get_service_status(service).await {
Ok(service_data) => {
match service_data.status {
ServiceStatus::Running => healthy += 1,
ServiceStatus::Degraded | ServiceStatus::Restarting => degraded += 1,
ServiceStatus::Stopped => failed += 1,
}
total_memory_used += service_data.memory_used_mb;
if service_data.memory_quota_mb > 0.0 {
total_memory_quota += service_data.memory_quota_mb;
}
total_disk_used += service_data.disk_used_gb;
services.push(service_data);
}
Err(e) => {
failed += 1;
// Add a placeholder service entry for failed collection
services.push(ServiceData {
name: service.clone(),
status: ServiceStatus::Stopped,
memory_used_mb: 0.0,
memory_quota_mb: 0.0,
cpu_percent: 0.0,
sandbox_limit: None,
disk_used_gb: 0.0,
});
tracing::warn!("Failed to collect metrics for service {}: {}", service, e);
}
}
}
// Get system memory info for quota calculation
let system_memory = self
.get_system_memory_info()
.await
.unwrap_or(SystemMemoryInfo {
total_mb: 0.0,
used_mb: 0.0,
});
let _disk_usage = self.get_disk_usage().await.unwrap_or(DiskUsage {
total_gb: 0.0,
used_gb: 0.0,
});
let (cpu_load_1, cpu_load_5, cpu_load_15) =
self.get_cpu_load().await.unwrap_or((0.0, 0.0, 0.0));
let cpu_freq_mhz = self.get_cpu_frequency_mhz().await;
let cpu_temp_c = self.get_cpu_temperature_c().await;
let (gpu_load_percent, gpu_temp_c) = self.get_gpu_metrics().await;
// If no specific quotas are set, use system memory as reference
if total_memory_quota == 0.0 {
total_memory_quota = system_memory.total_mb;
}
let service_metrics = json!({
"summary": {
"healthy": healthy,
"degraded": degraded,
"failed": failed,
"memory_used_mb": total_memory_used,
"memory_quota_mb": total_memory_quota,
"system_memory_used_mb": system_memory.used_mb,
"system_memory_total_mb": system_memory.total_mb,
"disk_used_gb": total_disk_used,
"disk_total_gb": total_disk_used, // For services, total = used (no quota concept)
"cpu_load_1": cpu_load_1,
"cpu_load_5": cpu_load_5,
"cpu_load_15": cpu_load_15,
"cpu_freq_mhz": cpu_freq_mhz,
"cpu_temp_c": cpu_temp_c,
"gpu_load_percent": gpu_load_percent,
"gpu_temp_c": gpu_temp_c,
},
"services": services,
"timestamp": Utc::now()
});
Ok(CollectorOutput {
agent_type: AgentType::Service,
data: service_metrics,
timestamp: Utc::now(),
})
}
}
#[derive(Debug, Clone, Serialize)]
struct ServiceData {
name: String,
status: ServiceStatus,
memory_used_mb: f32,
memory_quota_mb: f32,
cpu_percent: f32,
sandbox_limit: Option<f32>,
disk_used_gb: f32,
}
#[derive(Debug, Clone, Serialize)]
enum ServiceStatus {
Running,
Degraded,
Restarting,
Stopped,
}
struct SystemMemoryInfo {
total_mb: f32,
used_mb: f32,
}
#[allow(dead_code)]
struct DiskUsage {
total_gb: f32,
used_gb: f32,
}
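A small mapping check for determine_service_status; a test sketch over the systemd state strings handled above:

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn maps_systemd_states() {
        let collector = ServiceCollector::new(true, 2000, vec![]);
        let status = |active: &str, sub: &str| {
            collector.determine_service_status(
                &Some(active.to_string()),
                &Some(sub.to_string()),
            )
        };
        assert!(matches!(status("active", "running"), ServiceStatus::Running));
        assert!(matches!(status("active", "exited"), ServiceStatus::Running));
        assert!(matches!(status("activating", "start"), ServiceStatus::Restarting));
        assert!(matches!(status("failed", "failed"), ServiceStatus::Stopped));
        assert!(matches!(status("bogus", "bogus"), ServiceStatus::Degraded));
    }
}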

agent/src/collectors/smart.rs

@@ -0,0 +1,447 @@
use async_trait::async_trait;
use chrono::Utc;
use serde::{Deserialize, Serialize};
use serde_json::json;
use std::io::ErrorKind;
use std::process::Stdio;
use std::time::Duration;
use tokio::process::Command;
use tokio::time::timeout;
use super::{AgentType, Collector, CollectorError, CollectorOutput};
#[derive(Debug, Clone)]
pub struct SmartCollector {
pub enabled: bool,
pub interval: Duration,
pub devices: Vec<String>,
pub timeout_ms: u64,
}
impl SmartCollector {
pub fn new(enabled: bool, interval_ms: u64, devices: Vec<String>) -> Self {
Self {
enabled,
interval: Duration::from_millis(interval_ms),
devices,
timeout_ms: 30000, // 30 second timeout for smartctl
}
}
async fn get_smart_data(&self, device: &str) -> Result<SmartDeviceData, CollectorError> {
let timeout_duration = Duration::from_millis(self.timeout_ms);
let command_result = timeout(
timeout_duration,
Command::new("smartctl")
.args(["-a", "-j", &format!("/dev/{}", device)])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output(),
)
.await
.map_err(|_| CollectorError::Timeout {
duration_ms: self.timeout_ms,
})?;
let output = command_result.map_err(|e| match e.kind() {
ErrorKind::NotFound => CollectorError::ExternalDependency {
dependency: "smartctl".to_string(),
message: e.to_string(),
},
ErrorKind::PermissionDenied => CollectorError::PermissionDenied {
message: e.to_string(),
},
_ => CollectorError::CommandFailed {
command: format!("smartctl -a -j /dev/{}", device),
message: e.to_string(),
},
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
let stderr_lower = stderr.to_lowercase();
if stderr_lower.contains("permission denied") {
return Err(CollectorError::PermissionDenied {
message: stderr.to_string(),
});
}
if stderr_lower.contains("no such device") || stderr_lower.contains("cannot open") {
return Err(CollectorError::DeviceNotFound {
device: device.to_string(),
});
}
return Err(CollectorError::CommandFailed {
command: format!("smartctl -a -j /dev/{}", device),
message: stderr.to_string(),
});
}
let stdout = String::from_utf8_lossy(&output.stdout);
let smart_output: SmartCtlOutput =
serde_json::from_str(&stdout).map_err(|e| CollectorError::ParseError {
message: format!("Failed to parse smartctl output for {}: {}", device, e),
})?;
Ok(SmartDeviceData::from_smartctl_output(device, smart_output))
}
async fn get_drive_usage(&self, device: &str) -> Result<(Option<f32>, Option<f32>), CollectorError> {
// Get capacity first
let capacity = match self.get_drive_capacity(device).await {
Ok(cap) => Some(cap),
Err(_) => None,
};
// Try to get usage information
// For simplicity, we'll use the root filesystem usage for now
// In the future, this could be enhanced to map drives to specific mount points
let usage = if device.contains("nvme0n1") || device.contains("sda") {
// This is likely the main system drive, use root filesystem usage
match self.get_disk_usage().await {
Ok(disk_usage) => Some(disk_usage.used_gb),
Err(_) => None,
}
} else {
// For other drives, we don't have usage info yet
None
};
Ok((capacity, usage))
}
async fn get_drive_capacity(&self, device: &str) -> Result<f32, CollectorError> {
let output = Command::new("lsblk")
.args(["-J", "-o", "NAME,SIZE", &format!("/dev/{}", device)])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output()
.await
.map_err(|e| CollectorError::CommandFailed {
command: format!("lsblk -J -o NAME,SIZE /dev/{}", device),
message: e.to_string(),
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(CollectorError::CommandFailed {
command: format!("lsblk -J -o NAME,SIZE /dev/{}", device),
message: stderr.to_string(),
});
}
let stdout = String::from_utf8_lossy(&output.stdout);
let lsblk_output: serde_json::Value = serde_json::from_str(&stdout)
.map_err(|e| CollectorError::ParseError {
message: format!("Failed to parse lsblk JSON: {}", e),
})?;
// Extract size from the first blockdevice
if let Some(blockdevices) = lsblk_output["blockdevices"].as_array() {
if let Some(device_info) = blockdevices.first() {
if let Some(size_str) = device_info["size"].as_str() {
return self.parse_lsblk_size(size_str);
}
}
}
Err(CollectorError::ParseError {
message: format!("No size information found for device {}", device),
})
}
fn parse_lsblk_size(&self, size_str: &str) -> Result<f32, CollectorError> {
// Parse sizes like "953,9G", "1T", "512M"
let size_str = size_str.replace(',', "."); // Handle European decimal separator
if let Some(pos) = size_str.find(|c: char| c.is_alphabetic()) {
let (number_part, unit_part) = size_str.split_at(pos);
let number: f32 = number_part.parse()
.map_err(|e| CollectorError::ParseError {
message: format!("Failed to parse size number '{}': {}", number_part, e),
})?;
let multiplier = match unit_part.to_uppercase().as_str() {
"T" | "TB" => 1024.0,
"G" | "GB" => 1.0,
"M" | "MB" => 1.0 / 1024.0,
"K" | "KB" => 1.0 / (1024.0 * 1024.0),
_ => return Err(CollectorError::ParseError {
message: format!("Unknown size unit: {}", unit_part),
}),
};
Ok(number * multiplier)
} else {
Err(CollectorError::ParseError {
message: format!("Invalid size format: {}", size_str),
})
}
}
async fn get_disk_usage(&self) -> Result<DiskUsage, CollectorError> {
let output = Command::new("df")
.args(["-BG", "--output=size,used,avail", "/"])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output()
.await
.map_err(|e| CollectorError::CommandFailed {
command: "df -BG --output=size,used,avail /".to_string(),
message: e.to_string(),
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(CollectorError::CommandFailed {
command: "df -BG --output=size,used,avail /".to_string(),
message: stderr.to_string(),
});
}
let stdout = String::from_utf8_lossy(&output.stdout);
let lines: Vec<&str> = stdout.lines().collect();
if lines.len() < 2 {
return Err(CollectorError::ParseError {
message: "Unexpected df output format".to_string(),
});
}
// Skip header line, parse data line
let data_line = lines[1].trim();
let parts: Vec<&str> = data_line.split_whitespace().collect();
if parts.len() < 3 {
return Err(CollectorError::ParseError {
message: format!("Unexpected df data format: {}", data_line),
});
}
let parse_size = |s: &str| -> Result<f32, CollectorError> {
s.trim_end_matches('G')
.parse::<f32>()
.map_err(|e| CollectorError::ParseError {
message: format!("Failed to parse disk size '{}': {}", s, e),
})
};
Ok(DiskUsage {
total_gb: parse_size(parts[0])?,
used_gb: parse_size(parts[1])?,
available_gb: parse_size(parts[2])?,
})
}
}
#[async_trait]
impl Collector for SmartCollector {
fn name(&self) -> &str {
"smart"
}
fn agent_type(&self) -> AgentType {
AgentType::Smart
}
fn collect_interval(&self) -> Duration {
self.interval
}
fn is_enabled(&self) -> bool {
self.enabled
}
fn requires_root(&self) -> bool {
true // smartctl typically requires root access
}
async fn collect(&self) -> Result<CollectorOutput, CollectorError> {
let mut drives = Vec::new();
let mut issues = Vec::new();
let mut healthy = 0;
let mut warning = 0;
let mut critical = 0;
// Collect data from all configured devices
for device in &self.devices {
match self.get_smart_data(device).await {
Ok(mut drive_data) => {
// Try to get capacity and usage for this drive
if let Ok((capacity, usage)) = self.get_drive_usage(device).await {
drive_data.capacity_gb = capacity;
drive_data.used_gb = usage;
}
match drive_data.health_status.as_str() {
"PASSED" => healthy += 1,
"FAILED" => {
critical += 1;
issues.push(format!("{}: SMART status FAILED", device));
}
_ => {
warning += 1;
issues.push(format!("{}: Unknown SMART status", device));
}
}
drives.push(drive_data);
}
Err(e) => {
warning += 1;
issues.push(format!("{}: {}", device, e));
}
}
}
// Get disk usage information
let disk_usage = self.get_disk_usage().await?;
let status = if critical > 0 {
"CRITICAL"
} else if warning > 0 {
"WARNING"
} else {
"HEALTHY"
};
let smart_metrics = json!({
"status": status,
"drives": drives,
"summary": {
"healthy": healthy,
"warning": warning,
"critical": critical,
"capacity_total_gb": disk_usage.total_gb,
"capacity_used_gb": disk_usage.used_gb,
"capacity_available_gb": disk_usage.available_gb
},
"issues": issues,
"timestamp": Utc::now()
});
Ok(CollectorOutput {
agent_type: AgentType::Smart,
data: smart_metrics,
timestamp: Utc::now(),
})
}
}
#[derive(Debug, Clone, Serialize)]
struct SmartDeviceData {
name: String,
temperature_c: f32,
wear_level: f32,
power_on_hours: u64,
available_spare: f32,
health_status: String,
capacity_gb: Option<f32>,
used_gb: Option<f32>,
}
impl SmartDeviceData {
fn from_smartctl_output(device: &str, output: SmartCtlOutput) -> Self {
let temperature_c = output.temperature.and_then(|t| t.current).unwrap_or(0.0);
let wear_level = output
.nvme_smart_health_information_log
.as_ref()
.and_then(|nvme| nvme.percentage_used)
.unwrap_or(0.0);
let power_on_hours = output.power_on_time.and_then(|p| p.hours).unwrap_or(0);
let available_spare = output
.nvme_smart_health_information_log
.as_ref()
.and_then(|nvme| nvme.available_spare)
.unwrap_or(100.0);
let health_status = output
.smart_status
.and_then(|s| s.passed)
.map(|passed| {
if passed {
"PASSED".to_string()
} else {
"FAILED".to_string()
}
})
.unwrap_or_else(|| "UNKNOWN".to_string());
Self {
name: device.to_string(),
temperature_c,
wear_level,
power_on_hours,
available_spare,
health_status,
capacity_gb: None, // Will be set later by the collector
used_gb: None, // Will be set later by the collector
}
}
}
#[derive(Debug, Clone)]
struct DiskUsage {
total_gb: f32,
used_gb: f32,
available_gb: f32,
}
// Minimal smartctl JSON output structure - only the fields we need
#[derive(Debug, Deserialize)]
struct SmartCtlOutput {
temperature: Option<Temperature>,
power_on_time: Option<PowerOnTime>,
smart_status: Option<SmartStatus>,
nvme_smart_health_information_log: Option<NvmeSmartLog>,
}
#[derive(Debug, Deserialize)]
struct Temperature {
current: Option<f32>,
}
#[derive(Debug, Deserialize)]
struct PowerOnTime {
hours: Option<u64>,
}
#[derive(Debug, Deserialize)]
struct SmartStatus {
passed: Option<bool>,
}
#[derive(Debug, Deserialize)]
struct NvmeSmartLog {
percentage_used: Option<f32>,
available_spare: Option<f32>,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_lsblk_size() {
let collector = SmartCollector::new(true, 5000, vec![]);
// Test gigabyte sizes
assert!((collector.parse_lsblk_size("953,9G").unwrap() - 953.9).abs() < 0.1);
assert!((collector.parse_lsblk_size("1G").unwrap() - 1.0).abs() < 0.1);
// Test terabyte sizes
assert!((collector.parse_lsblk_size("1T").unwrap() - 1024.0).abs() < 0.1);
assert!((collector.parse_lsblk_size("2,5T").unwrap() - 2560.0).abs() < 0.1);
// Test megabyte sizes
assert!((collector.parse_lsblk_size("512M").unwrap() - 0.5).abs() < 0.1);
// Test error cases
assert!(collector.parse_lsblk_size("invalid").is_err());
assert!(collector.parse_lsblk_size("1X").is_err());
}
}

agent/src/config.rs

@@ -0,0 +1,315 @@
use serde::{Deserialize, Serialize};
use std::path::Path;
use tokio::fs;
use tracing::info;
use crate::collectors::CollectorError;
use crate::discovery::AutoDiscovery;
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct AgentConfig {
pub agent: AgentSettings,
pub zmq: ZmqSettings,
pub collectors: CollectorsConfig,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct AgentSettings {
pub hostname: String,
pub log_level: String,
pub metrics_buffer_size: usize,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ZmqSettings {
pub port: u16,
pub bind_address: String,
pub send_timeout_ms: u64,
pub receive_timeout_ms: u64,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct CollectorsConfig {
pub smart: SmartCollectorConfig,
pub service: ServiceCollectorConfig,
pub backup: BackupCollectorConfig,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SmartCollectorConfig {
pub enabled: bool,
pub interval_ms: u64,
pub devices: Vec<String>,
pub timeout_ms: u64,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ServiceCollectorConfig {
pub enabled: bool,
pub interval_ms: u64,
pub services: Vec<String>,
pub timeout_ms: u64,
}
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct BackupCollectorConfig {
pub enabled: bool,
pub interval_ms: u64,
pub restic_repo: Option<String>,
pub backup_service: String,
pub timeout_ms: u64,
}
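// A hedged example of the on-disk TOML these structs deserialize from
// (values are illustrative; the shipped defaults live in `Default` below):
//
// [agent]
// hostname = "srv01"
// log_level = "info"
// metrics_buffer_size = 1000
//
// [zmq]
// port = 6130
// bind_address = "0.0.0.0"
// send_timeout_ms = 5000
// receive_timeout_ms = 5000
//
// [collectors.smart]
// enabled = true
// interval_ms = 5000
// devices = ["nvme0n1"]
// timeout_ms = 30000
//
// [collectors.service]
// enabled = true
// interval_ms = 2000
// services = ["gitea", "immich", "vaultwarden"]
// timeout_ms = 10000
//
// [collectors.backup]
// enabled = true
// interval_ms = 30000
// restic_repo = "/srv/backups/restic"   # illustrative path
// backup_service = "restic-backup"
// timeout_ms = 30000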
impl Default for AgentConfig {
fn default() -> Self {
Self {
agent: AgentSettings {
hostname: gethostname::gethostname().to_string_lossy().to_string(),
log_level: "info".to_string(),
metrics_buffer_size: 1000,
},
zmq: ZmqSettings {
port: 6130,
bind_address: "0.0.0.0".to_string(),
send_timeout_ms: 5000,
receive_timeout_ms: 5000,
},
collectors: CollectorsConfig {
smart: SmartCollectorConfig {
enabled: true,
interval_ms: 5000,
devices: vec!["nvme0n1".to_string()],
timeout_ms: 30000,
},
service: ServiceCollectorConfig {
enabled: true,
interval_ms: 2000,
services: vec![
"gitea".to_string(),
"immich".to_string(),
"vaultwarden".to_string(),
"unifi".to_string(),
],
timeout_ms: 10000,
},
backup: BackupCollectorConfig {
enabled: true,
interval_ms: 30000,
restic_repo: None,
backup_service: "restic-backup".to_string(),
timeout_ms: 30000,
},
},
}
}
}
impl AgentConfig {
pub async fn load_from_file<P: AsRef<Path>>(path: P) -> Result<Self, CollectorError> {
let content = fs::read_to_string(path)
.await
.map_err(|e| CollectorError::ConfigError {
message: format!("Failed to read config file: {}", e),
})?;
let config: AgentConfig =
toml::from_str(&content).map_err(|e| CollectorError::ConfigError {
message: format!("Failed to parse config file: {}", e),
})?;
config.validate()?;
Ok(config)
}
pub async fn save_to_file<P: AsRef<Path>>(&self, path: P) -> Result<(), CollectorError> {
let content = toml::to_string_pretty(self).map_err(|e| CollectorError::ConfigError {
message: format!("Failed to serialize config: {}", e),
})?;
fs::write(path, content)
.await
.map_err(|e| CollectorError::ConfigError {
message: format!("Failed to write config file: {}", e),
})?;
Ok(())
}
pub fn validate(&self) -> Result<(), CollectorError> {
// Validate ZMQ settings
if self.zmq.port == 0 {
return Err(CollectorError::ConfigError {
message: "ZMQ port cannot be 0".to_string(),
});
}
// Validate collector intervals
if self.collectors.smart.enabled && self.collectors.smart.interval_ms < 1000 {
return Err(CollectorError::ConfigError {
message: "SMART collector interval must be at least 1000ms".to_string(),
});
}
if self.collectors.service.enabled && self.collectors.service.interval_ms < 500 {
return Err(CollectorError::ConfigError {
message: "Service collector interval must be at least 500ms".to_string(),
});
}
if self.collectors.backup.enabled && self.collectors.backup.interval_ms < 5000 {
return Err(CollectorError::ConfigError {
message: "Backup collector interval must be at least 5000ms".to_string(),
});
}
// Validate smart devices
if self.collectors.smart.enabled && self.collectors.smart.devices.is_empty() {
return Err(CollectorError::ConfigError {
message: "SMART collector requires at least one device".to_string(),
});
}
// Validate services
if self.collectors.service.enabled && self.collectors.service.services.is_empty() {
return Err(CollectorError::ConfigError {
message: "Service collector requires at least one service".to_string(),
});
}
// Validate backup configuration
if self.collectors.backup.enabled {
if self.collectors.backup.restic_repo.is_none() {
tracing::warn!("Backup collector enabled but no restic repository configured");
}
if self.collectors.backup.backup_service.is_empty() {
return Err(CollectorError::ConfigError {
message: "Backup collector requires a backup service name".to_string(),
});
}
}
Ok(())
}
pub fn get_enabled_collector_count(&self) -> usize {
let mut count = 0;
if self.collectors.smart.enabled {
count += 1;
}
if self.collectors.service.enabled {
count += 1;
}
if self.collectors.backup.enabled {
count += 1;
}
count
}
pub async fn auto_configure(&mut self) -> Result<(), CollectorError> {
        let hostname = self.agent.hostname.clone();
info!("Auto-configuring agent for host: {}", hostname);
// Auto-detect storage devices
let devices = AutoDiscovery::discover_storage_devices().await;
let valid_devices = AutoDiscovery::validate_devices(&devices).await;
if !valid_devices.is_empty() {
self.collectors.smart.devices = valid_devices;
info!(
"Auto-detected storage devices: {:?}",
self.collectors.smart.devices
);
} else {
info!("No accessible storage devices found, disabling SMART collector");
self.collectors.smart.enabled = false;
}
// Auto-detect services
let services = AutoDiscovery::discover_services().await;
if !services.is_empty() {
self.collectors.service.services = services;
info!(
"Auto-detected services: {:?}",
self.collectors.service.services
);
} else {
info!("No monitorable services found, using minimal service list");
self.collectors.service.services = vec!["ssh".to_string()];
}
// Auto-detect backup configuration
let (backup_enabled, restic_repo, backup_service) =
            AutoDiscovery::discover_backup_config(&hostname).await;
self.collectors.backup.enabled = backup_enabled;
self.collectors.backup.restic_repo = restic_repo;
self.collectors.backup.backup_service = backup_service;
if backup_enabled {
info!(
"Auto-configured backup monitoring: repo={:?}, service={}",
self.collectors.backup.restic_repo, self.collectors.backup.backup_service
);
} else {
info!("Backup monitoring disabled for this host");
}
// Apply host-specific timing optimizations
        self.apply_host_timing_overrides(&hostname);
Ok(())
}
fn apply_host_timing_overrides(&mut self, hostname: &str) {
match hostname {
"srv01" => {
// Server host - more frequent monitoring
self.collectors.service.interval_ms = 1000;
self.collectors.smart.interval_ms = 5000;
}
"cmbox" | "labbox" | "simonbox" | "steambox" => {
// Workstation hosts - less frequent monitoring
self.collectors.smart.interval_ms = 10000;
self.collectors.service.interval_ms = 5000;
}
_ => {
// Unknown host - conservative defaults
self.collectors.smart.interval_ms = 10000;
self.collectors.service.interval_ms = 5000;
}
}
info!(
"Applied timing overrides for {}: smart={}ms, service={}ms",
hostname, self.collectors.smart.interval_ms, self.collectors.service.interval_ms
);
}
pub fn summary(&self) -> String {
let mut parts = Vec::new();
if self.collectors.smart.enabled {
parts.push(format!(
"SMART({} devices)",
self.collectors.smart.devices.len()
));
}
if self.collectors.service.enabled {
parts.push(format!(
"Services({} monitored)",
self.collectors.service.services.len()
));
}
if self.collectors.backup.enabled {
parts.push("Backup".to_string());
}
if parts.is_empty() {
"No collectors enabled".to_string()
} else {
parts.join(", ")
}
}
}
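// Illustrative sketch (not from this commit) of the intended startup flow:
// load a config when one exists, fall back to defaults, auto-detect, and
// optionally persist the result. The path below is a placeholder.
async fn bootstrap_config() -> Result<AgentConfig, CollectorError> {
    let path = "/etc/cm-dashboard/agent.toml"; // placeholder location
    let mut config = match AgentConfig::load_from_file(path).await {
        Ok(config) => config,
        Err(_) => AgentConfig::default(),
    };
    config.auto_configure().await?;
    config.save_to_file(path).await?;
    Ok(config)
}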

449
agent/src/discovery.rs Normal file
View File

@ -0,0 +1,449 @@
use std::collections::HashSet;
use std::process::Stdio;
use tokio::fs;
use tokio::process::Command;
use tracing::{debug, warn};
use crate::collectors::CollectorError;
pub struct AutoDiscovery;
impl AutoDiscovery {
/// Auto-detect storage devices suitable for SMART monitoring
pub async fn discover_storage_devices() -> Vec<String> {
let mut devices = Vec::new();
// Method 1: Try lsblk to find block devices
if let Ok(lsblk_devices) = Self::discover_via_lsblk().await {
devices.extend(lsblk_devices);
}
// Method 2: Scan /dev for common device patterns
if devices.is_empty() {
if let Ok(dev_devices) = Self::discover_via_dev_scan().await {
devices.extend(dev_devices);
}
}
// Method 3: Fallback to common device names
if devices.is_empty() {
devices = Self::fallback_device_names();
}
// Remove duplicates and sort
let mut unique_devices: Vec<String> = devices
.into_iter()
.collect::<HashSet<_>>()
.into_iter()
.collect();
unique_devices.sort();
debug!("Auto-detected storage devices: {:?}", unique_devices);
unique_devices
}
async fn discover_via_lsblk() -> Result<Vec<String>, CollectorError> {
let output = Command::new("lsblk")
.args(["-d", "-o", "NAME,TYPE", "-n", "-r"])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output()
.await
.map_err(|e| CollectorError::CommandFailed {
command: "lsblk".to_string(),
message: e.to_string(),
})?;
if !output.status.success() {
return Err(CollectorError::CommandFailed {
command: "lsblk".to_string(),
message: String::from_utf8_lossy(&output.stderr).to_string(),
});
}
let stdout = String::from_utf8_lossy(&output.stdout);
let mut devices = Vec::new();
for line in stdout.lines() {
let parts: Vec<&str> = line.split_whitespace().collect();
if parts.len() >= 2 {
let device_name = parts[0];
let device_type = parts[1];
// Include disk type devices and filter out unwanted ones
if device_type == "disk" && Self::is_suitable_device(device_name) {
devices.push(device_name.to_string());
}
}
}
Ok(devices)
}
async fn discover_via_dev_scan() -> Result<Vec<String>, CollectorError> {
let mut devices = Vec::new();
// Read /dev directory
let mut dev_entries = fs::read_dir("/dev")
.await
.map_err(|e| CollectorError::IoError {
message: e.to_string(),
})?;
while let Some(entry) =
dev_entries
.next_entry()
.await
.map_err(|e| CollectorError::IoError {
message: e.to_string(),
})?
{
let file_name = entry.file_name();
let device_name = file_name.to_string_lossy();
if Self::is_suitable_device(&device_name) {
devices.push(device_name.to_string());
}
}
Ok(devices)
}
fn is_suitable_device(device_name: &str) -> bool {
// Include NVMe, SATA, and other storage devices
// Exclude partitions, loop devices, etc.
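        // Illustrative outcomes: "nvme0n1" -> true, "nvme0n1p1" -> false
        // (partition), "sda" -> true, "sda1" -> false.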
(device_name.starts_with("nvme") && device_name.contains("n") && !device_name.contains("p")) ||
(device_name.starts_with("sd") && device_name.len() == 3) || // sda, sdb, etc. not sda1
(device_name.starts_with("hd") && device_name.len() == 3) || // hda, hdb, etc.
(device_name.starts_with("vd") && device_name.len() == 3) // vda, vdb for VMs
}
fn fallback_device_names() -> Vec<String> {
vec!["nvme0n1".to_string(), "sda".to_string(), "sdb".to_string()]
}
/// Auto-detect systemd services suitable for monitoring
pub async fn discover_services() -> Vec<String> {
let mut services = Vec::new();
// Method 1: Try to find running services
if let Ok(running_services) = Self::discover_running_services().await {
services.extend(running_services);
}
// Method 2: Add host-specific services based on hostname
let hostname = gethostname::gethostname().to_string_lossy().to_string();
services.extend(Self::get_host_specific_services(&hostname));
// Normalize aliases and verify the units actually exist before deduping
let canonicalized: Vec<String> = services
.into_iter()
.filter_map(|svc| Self::canonical_service_name(&svc))
.collect();
let existing = Self::filter_existing_services(&canonicalized).await;
let mut unique_services: Vec<String> = existing
.into_iter()
.collect::<HashSet<_>>()
.into_iter()
.collect();
unique_services.sort();
debug!("Auto-detected services: {:?}", unique_services);
unique_services
}
async fn discover_running_services() -> Result<Vec<String>, CollectorError> {
let output = Command::new("systemctl")
.args([
"list-units",
"--type=service",
"--state=active",
"--no-pager",
"--no-legend",
])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output()
.await
.map_err(|e| CollectorError::CommandFailed {
command: "systemctl list-units".to_string(),
message: e.to_string(),
})?;
if !output.status.success() {
return Err(CollectorError::CommandFailed {
command: "systemctl list-units".to_string(),
message: String::from_utf8_lossy(&output.stderr).to_string(),
});
}
let stdout = String::from_utf8_lossy(&output.stdout);
let mut services = Vec::new();
for line in stdout.lines() {
let parts: Vec<&str> = line.split_whitespace().collect();
if !parts.is_empty() {
let service_name = parts[0];
// Remove .service suffix if present
let clean_name = service_name
.strip_suffix(".service")
.unwrap_or(service_name);
// Only include services we're interested in monitoring
if Self::is_monitorable_service(clean_name) {
services.push(clean_name.to_string());
}
}
}
Ok(services)
}
fn is_monitorable_service(service_name: &str) -> bool {
// Define patterns for services we want to monitor
let interesting_services = [
// Web applications
"gitea",
"immich",
"vaultwarden",
"unifi",
"wordpress",
"nginx",
"apache2",
"httpd",
"caddy",
// Databases
"postgresql",
"mysql",
"mariadb",
"redis",
"mongodb",
// Monitoring and infrastructure
"smart-metrics-api",
"service-metrics-api",
"backup-metrics-api",
"prometheus",
"grafana",
"influxdb",
// Backup and storage
"restic",
"borg",
"rclone",
"syncthing",
// Container runtimes
"docker",
"podman",
"containerd",
// Network services
"sshd",
"dnsmasq",
"bind9",
"pihole",
// Media services
"plex",
"jellyfin",
"emby",
"sonarr",
"radarr",
];
// Check if service name contains any of our interesting patterns
interesting_services
.iter()
.any(|&pattern| service_name.contains(pattern) || pattern.contains(service_name))
}
fn get_host_specific_services(hostname: &str) -> Vec<String> {
match hostname {
"srv01" => vec![
"gitea".to_string(),
"immich".to_string(),
"vaultwarden".to_string(),
"unifi".to_string(),
"smart-metrics-api".to_string(),
"service-metrics-api".to_string(),
"backup-metrics-api".to_string(),
],
"cmbox" | "labbox" | "simonbox" => vec!["docker".to_string(), "sshd".to_string()],
"steambox" => vec!["steam".to_string(), "sshd".to_string()],
_ => vec!["sshd".to_string()],
}
}
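    /// Normalizes common aliases to canonical unit names ("ssh" -> "sshd",
    /// "docker.service" -> "docker"); empty or whitespace-only input yields None.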
fn canonical_service_name(service: &str) -> Option<String> {
let trimmed = service.trim();
if trimmed.is_empty() {
return None;
}
let lower = trimmed.to_lowercase();
let aliases = [
("ssh", "sshd"),
("sshd", "sshd"),
("docker.service", "docker"),
];
for (alias, target) in aliases {
if lower == alias {
return Some(target.to_string());
}
}
Some(trimmed.to_string())
}
async fn filter_existing_services(services: &[String]) -> Vec<String> {
let mut existing = Vec::new();
for service in services {
if Self::service_exists(service).await {
existing.push(service.clone());
}
}
existing
}
async fn service_exists(service: &str) -> bool {
let unit = if service.ends_with(".service") {
service.to_string()
} else {
format!("{}.service", service)
};
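        // `systemctl status` exits 0 only for active units, so this effectively
        // checks "exists and is currently running" rather than bare existence.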
match Command::new("systemctl")
.args(["status", &unit])
.stdout(Stdio::null())
.stderr(Stdio::null())
.output()
.await
{
Ok(output) => output.status.success(),
Err(error) => {
warn!("Failed to check service {}: {}", unit, error);
false
}
}
}
/// Auto-detect backup configuration
pub async fn discover_backup_config(hostname: &str) -> (bool, Option<String>, String) {
// Check if this host should have backup monitoring
let backup_enabled = hostname == "srv01" || Self::has_backup_service().await;
// Try to find restic repository
let restic_repo = if backup_enabled {
Self::discover_restic_repo().await
} else {
None
};
// Determine backup service name
let backup_service = Self::discover_backup_service()
.await
.unwrap_or_else(|| "restic-backup".to_string());
(backup_enabled, restic_repo, backup_service)
}
async fn has_backup_service() -> bool {
// Check for common backup services
let backup_services = ["restic", "borg", "duplicati", "rclone"];
for service in backup_services {
if let Ok(output) = Command::new("systemctl")
.args(["is-enabled", service])
.output()
.await
{
if output.status.success() {
return true;
}
}
}
false
}
async fn discover_restic_repo() -> Option<String> {
// Common restic repository locations
let common_paths = [
"/srv/backups/restic",
"/var/backups/restic",
"/home/restic",
"/backup/restic",
"/mnt/backup/restic",
];
for path in common_paths {
if fs::metadata(path).await.is_ok() {
debug!("Found restic repository at: {}", path);
return Some(path.to_string());
}
}
// Try to find via environment variables or config files
if let Ok(content) = fs::read_to_string("/etc/restic/repository").await {
let repo_path = content.trim();
if !repo_path.is_empty() {
return Some(repo_path.to_string());
}
}
None
}
async fn discover_backup_service() -> Option<String> {
let backup_services = ["restic-backup", "restic", "borg-backup", "borg", "backup"];
for service in backup_services {
if let Ok(output) = Command::new("systemctl")
.args(["is-enabled", &format!("{}.service", service)])
.output()
.await
{
if output.status.success() {
return Some(service.to_string());
}
}
}
None
}
/// Validate auto-detected configuration
pub async fn validate_devices(devices: &[String]) -> Vec<String> {
let mut valid_devices = Vec::new();
for device in devices {
if Self::can_access_device(device).await {
valid_devices.push(device.clone());
} else {
warn!("Cannot access device {}, skipping", device);
}
}
valid_devices
}
async fn can_access_device(device: &str) -> bool {
let device_path = format!("/dev/{}", device);
// Try to run smartctl to see if device is accessible
if let Ok(output) = Command::new("smartctl")
.args(["-i", &device_path])
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.output()
.await
{
            // smartctl returns 0 on success but may also return low non-zero codes
            // for warnings that are still acceptable (e.g. the device supports SMART
            // but reports minor issues), so exit codes up to 4 count as accessible.
output.status.code().map_or(false, |code| code <= 4)
} else {
false
}
}
}

View File

@ -1,161 +1,182 @@
use std::thread;
use std::time::Duration;
use anyhow::{anyhow, Context, Result};
use chrono::Utc;
use anyhow::{anyhow, Result};
use clap::{ArgAction, Parser};
use cm_dashboard_shared::envelope::{AgentType, MetricsEnvelope};
use rand::Rng;
use serde_json::json;
use tracing::info;
use std::path::PathBuf;
use tokio::signal;
use tracing::{error, info};
use tracing_subscriber::EnvFilter;
use zmq::{Context as ZmqContext, SocketType};
mod agent;
mod collectors;
mod config;
mod discovery;
mod scheduler;
use agent::MetricsAgent;
use config::AgentConfig;
#[derive(Parser, Debug)]
#[command(
name = "cm-dashboard-agent",
version,
about = "CM Dashboard metrics agent"
about = "CM Dashboard ZMQ metrics agent with auto-detection"
)]
struct Cli {
/// Hostname to advertise in metric envelopes
#[arg(long, value_name = "HOSTNAME")]
hostname: String,
/// ZMQ port to bind to (default: 6130)
#[arg(long, value_name = "PORT")]
port: Option<u16>,
/// Bind endpoint for PUB socket (default tcp://*:6130)
#[arg(long, default_value = "tcp://*:6130", value_name = "ENDPOINT")]
bind: String,
/// Path to load configuration from
#[arg(long, value_name = "FILE")]
config: Option<PathBuf>,
/// Publish interval in milliseconds
#[arg(long, default_value_t = 5000)]
interval_ms: u64,
/// Optional path to persist the resolved configuration
#[arg(long, value_name = "FILE")]
write_config: Option<PathBuf>,
/// Disable smart metrics publisher
/// Disable SMART metrics collector
#[arg(long, action = ArgAction::SetTrue)]
disable_smart: bool,
/// Disable service metrics publisher
/// Disable service metrics collector
#[arg(long, action = ArgAction::SetTrue)]
disable_service: bool,
/// Disable backup metrics publisher
/// Disable backup metrics collector
#[arg(long, action = ArgAction::SetTrue)]
disable_backup: bool,
/// Skip auto-detection and use minimal defaults
#[arg(long, action = ArgAction::SetTrue)]
no_auto_detect: bool,
/// Show detected configuration and exit
#[arg(long, action = ArgAction::SetTrue)]
show_config: bool,
/// Increase logging verbosity (-v, -vv)
#[arg(short, long, action = ArgAction::Count)]
verbose: u8,
}
fn main() -> Result<()> {
#[tokio::main]
async fn main() -> Result<()> {
let cli = Cli::parse();
init_tracing(cli.verbose)?;
let context = ZmqContext::new();
let socket = context
.socket(SocketType::PUB)
.context("failed to create ZMQ PUB socket")?;
socket
.bind(&cli.bind)
.with_context(|| format!("failed to bind to {}", cli.bind))?;
info!(endpoint = %cli.bind, host = %cli.hostname, "agent started");
// Start with file-based configuration if requested, otherwise defaults
let mut config = if let Some(path) = cli.config.as_ref() {
AgentConfig::load_from_file(path)
.await
.map_err(|e| anyhow!("Failed to load config from {}: {}", path.display(), e))?
} else {
AgentConfig::default()
};
let interval = Duration::from_millis(cli.interval_ms.max(100));
let mut rng = rand::thread_rng();
// Hostname is auto-detected in AgentConfig::default()
loop {
let now = Utc::now();
let timestamp = now.timestamp() as u64;
let timestamp_rfc3339 = now.to_rfc3339();
if !cli.disable_smart {
let envelope = MetricsEnvelope {
hostname: cli.hostname.clone(),
agent_type: AgentType::Smart,
timestamp,
metrics: json!({
"status": "Healthy",
"drives": [{
"name": "nvme0n1",
"temperature_c": rng.gen_range(30.0..60.0),
"wear_level": rng.gen_range(1.0..10.0),
"power_on_hours": rng.gen_range(1000..20000),
"available_spare": rng.gen_range(90.0..100.0)
}],
"summary": {
"healthy": 1,
"warning": 0,
"critical": 0,
"capacity_total_gb": 1024,
"capacity_used_gb": rng.gen_range(100.0..800.0)
},
"issues": [],
"timestamp": timestamp_rfc3339
}),
};
publish(&socket, &envelope)?;
}
if !cli.disable_service {
let envelope = MetricsEnvelope {
hostname: cli.hostname.clone(),
agent_type: AgentType::Service,
timestamp,
metrics: json!({
"summary": {
"healthy": 5,
"degraded": 0,
"failed": 0,
"memory_used_mb": rng.gen_range(512.0..2048.0),
"memory_quota_mb": 4096.0
},
"services": [
{
"name": "example",
"status": "Running",
"memory_used_mb": rng.gen_range(128.0..512.0),
"memory_quota_mb": 1024.0,
"cpu_percent": rng.gen_range(0.0..75.0),
"sandbox_limit": null
}
],
"timestamp": timestamp_rfc3339
}),
};
publish(&socket, &envelope)?;
}
if !cli.disable_backup {
let envelope = MetricsEnvelope {
hostname: cli.hostname.clone(),
agent_type: AgentType::Backup,
timestamp,
metrics: json!({
"overall_status": "Healthy",
"backup": {
"last_success": timestamp_rfc3339,
"last_failure": null,
"size_gb": rng.gen_range(100.0..500.0),
"snapshot_count": rng.gen_range(10..40)
},
"service": {
"enabled": true,
"pending_jobs": 0,
"last_message": "Backups up-to-date"
},
"timestamp": timestamp_rfc3339
}),
};
publish(&socket, &envelope)?;
}
thread::sleep(interval);
// Apply CLI port override
if let Some(port) = cli.port {
config.zmq.port = port;
}
// Run auto-detection unless disabled
if !cli.no_auto_detect {
info!("Auto-detecting system configuration...");
config
.auto_configure()
.await
.map_err(|e| anyhow!("Auto-detection failed: {}", e))?;
} else {
info!("Skipping auto-detection, using minimal defaults");
}
// Apply CLI collector overrides after auto-detection
if cli.disable_smart {
config.collectors.smart.enabled = false;
}
if cli.disable_service {
config.collectors.service.enabled = false;
}
if cli.disable_backup {
config.collectors.backup.enabled = false;
}
if let Some(path) = cli.write_config.as_ref() {
config
.save_to_file(path)
.await
.map_err(|e| anyhow!("Failed to write config to {}: {}", path.display(), e))?;
info!("Persisted configuration to {}", path.display());
}
// Show configuration and exit if requested
if cli.show_config {
println!("Agent Configuration:");
println!(" Hostname: {}", config.agent.hostname);
println!(" ZMQ Port: {}", config.zmq.port);
println!(" Collectors: {}", config.summary());
if config.collectors.smart.enabled {
println!(" SMART Devices: {:?}", config.collectors.smart.devices);
}
if config.collectors.service.enabled {
println!(" Services: {:?}", config.collectors.service.services);
}
if config.collectors.backup.enabled {
println!(" Backup Repo: {:?}", config.collectors.backup.restic_repo);
println!(
" Backup Service: {}",
config.collectors.backup.backup_service
);
}
return Ok(());
}
info!(
"Starting agent for host '{}' on port {} with: {}",
config.agent.hostname,
config.zmq.port,
config.summary()
);
// Build and start the agent
let mut agent =
MetricsAgent::from_config(config).map_err(|e| anyhow!("Failed to create agent: {}", e))?;
// Set up graceful shutdown handling
let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel();
tokio::spawn(async move {
let mut sigterm = signal::unix::signal(signal::unix::SignalKind::terminate())
.expect("Failed to install SIGTERM handler");
let mut sigint = signal::unix::signal(signal::unix::SignalKind::interrupt())
.expect("Failed to install SIGINT handler");
tokio::select! {
_ = sigterm.recv() => info!("Received SIGTERM"),
_ = sigint.recv() => info!("Received SIGINT"),
}
let _ = shutdown_tx.send(());
});
// Run the agent until shutdown
tokio::select! {
result = agent.run() => {
match result {
Ok(_) => info!("Agent completed successfully"),
Err(e) => error!("Agent error: {}", e),
}
}
_ = shutdown_rx => {
info!("Shutdown signal received");
agent.shutdown().await;
}
}
}
fn publish(socket: &zmq::Socket, envelope: &MetricsEnvelope) -> Result<()> {
let serialized = serde_json::to_vec(envelope)?;
socket.send(serialized, 0)?;
Ok(())
}

393
agent/src/scheduler.rs Normal file
View File

@ -0,0 +1,393 @@
use futures::stream::{FuturesUnordered, StreamExt};
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{mpsc, RwLock};
use tokio::time::{interval, Instant};
use tracing::{debug, error, info, warn};
use crate::collectors::{Collector, CollectorError, CollectorOutput};
pub struct CollectorScheduler {
collectors: Vec<Arc<dyn Collector>>,
sender: mpsc::UnboundedSender<SchedulerEvent>,
receiver: mpsc::UnboundedReceiver<SchedulerEvent>,
stats: Arc<RwLock<SchedulerStats>>,
metrics_sender: Option<mpsc::UnboundedSender<CollectorOutput>>,
}
#[derive(Debug)]
pub enum SchedulerEvent {
CollectionResult {
collector_name: String,
result: Result<CollectorOutput, CollectorError>,
duration: Duration,
},
Shutdown,
}
#[derive(Debug, Default, Clone)]
pub struct SchedulerStats {
pub total_collections: u64,
pub successful_collections: u64,
pub failed_collections: u64,
pub collector_stats: HashMap<String, CollectorStats>,
}
#[derive(Debug, Default, Clone)]
pub struct CollectorStats {
pub total_collections: u64,
pub successful_collections: u64,
pub failed_collections: u64,
pub last_success: Option<Instant>,
pub last_failure: Option<Instant>,
pub average_duration_ms: f64,
pub consecutive_failures: u32,
}
impl CollectorScheduler {
pub fn new() -> Self {
let (sender, receiver) = mpsc::unbounded_channel();
Self {
collectors: Vec::new(),
sender,
receiver,
stats: Arc::new(RwLock::new(SchedulerStats::default())),
metrics_sender: None,
}
}
pub fn set_metrics_sender(&mut self, sender: mpsc::UnboundedSender<CollectorOutput>) {
self.metrics_sender = Some(sender);
}
pub fn clear_metrics_sender(&mut self) {
self.metrics_sender = None;
}
pub fn add_collector(&mut self, collector: Arc<dyn Collector>) {
if collector.is_enabled() {
info!(
"Adding collector '{}' [{}] with interval {:?}",
collector.name(),
collector.agent_type().as_str(),
collector.collect_interval()
);
if collector.requires_root() {
debug!("Collector '{}' is flagged as root-only", collector.name());
}
self.collectors.push(collector);
} else {
info!("Skipping disabled collector '{}'", collector.name());
}
}
pub async fn start(&mut self) -> Result<(), CollectorError> {
if self.collectors.is_empty() {
return Err(CollectorError::ConfigError {
message: "No enabled collectors configured".to_string(),
});
}
info!(
"Starting scheduler with {} collectors",
self.collectors.len()
);
// Start collection tasks for each collector
let mut collection_tasks = FuturesUnordered::new();
for collector in self.collectors.clone() {
let sender = self.sender.clone();
let stats = self.stats.clone();
let task =
tokio::spawn(async move { Self::run_collector(collector, sender, stats).await });
collection_tasks.push(task);
}
// Main event loop
loop {
tokio::select! {
// Handle collection results
Some(event) = self.receiver.recv() => {
match event {
SchedulerEvent::CollectionResult { collector_name, result, duration } => {
self.handle_collection_result(&collector_name, result, duration).await;
}
SchedulerEvent::Shutdown => {
info!("Scheduler shutdown requested");
break;
}
}
}
// Handle task completion (shouldn't happen in normal operation)
Some(result) = collection_tasks.next() => {
match result {
Ok(_) => warn!("Collection task completed unexpectedly"),
Err(e) => error!("Collection task failed: {}", e),
}
}
// If all tasks are done and no more events, break
else => {
warn!("All collection tasks completed, shutting down scheduler");
break;
}
}
}
Ok(())
}
async fn run_collector(
collector: Arc<dyn Collector>,
sender: mpsc::UnboundedSender<SchedulerEvent>,
_stats: Arc<RwLock<SchedulerStats>>,
) {
let collector_name = collector.name().to_string();
let mut interval_timer = interval(collector.collect_interval());
interval_timer.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
info!("Starting collection loop for '{}'", collector_name);
loop {
interval_timer.tick().await;
debug!("Running collection for '{}'", collector_name);
let start_time = Instant::now();
match collector.collect().await {
Ok(output) => {
let duration = start_time.elapsed();
debug!(
"Collection '{}' completed in {:?}",
collector_name, duration
);
if let Err(e) = sender.send(SchedulerEvent::CollectionResult {
collector_name: collector_name.clone(),
result: Ok(output),
duration,
}) {
error!(
"Failed to send collection result for '{}': {}",
collector_name, e
);
break;
}
}
Err(error) => {
let duration = start_time.elapsed();
warn!(
"Collection '{}' failed after {:?}: {}",
collector_name, duration, error
);
if let Err(e) = sender.send(SchedulerEvent::CollectionResult {
collector_name: collector_name.clone(),
result: Err(error),
duration,
}) {
error!(
"Failed to send collection error for '{}': {}",
collector_name, e
);
break;
}
}
}
}
warn!("Collection loop for '{}' ended", collector_name);
}
async fn handle_collection_result(
&self,
collector_name: &str,
result: Result<CollectorOutput, CollectorError>,
duration: Duration,
) {
let publish_output = match &result {
Ok(output) => Some(output.clone()),
Err(_) => None,
};
{
let mut stats = self.stats.write().await;
stats.total_collections += 1;
match &result {
Ok(_) => {
stats.successful_collections += 1;
}
Err(_) => {
stats.failed_collections += 1;
}
}
}
// Handle collector-specific stats
{
let mut stats = self.stats.write().await;
let duration_ms = duration.as_millis() as f64;
let collector_stats = stats
.collector_stats
.entry(collector_name.to_string())
.or_default();
collector_stats.total_collections += 1;
if collector_stats.average_duration_ms == 0.0 {
collector_stats.average_duration_ms = duration_ms;
} else {
                // Exponentially weighted moving average (90% previous, 10% latest)
collector_stats.average_duration_ms =
(collector_stats.average_duration_ms * 0.9) + (duration_ms * 0.1);
}
match &result {
Ok(output) => {
collector_stats.successful_collections += 1;
collector_stats.last_success = Some(Instant::now());
collector_stats.consecutive_failures = 0;
let metrics_count = match &output.data {
serde_json::Value::Object(map) => map.len(),
serde_json::Value::Array(values) => values.len(),
_ => 1,
};
debug!(
"Collector '{}' [{}] successful at {} ({} metrics)",
collector_name,
output.agent_type.as_str(),
output.timestamp,
metrics_count
);
}
Err(error) => {
collector_stats.failed_collections += 1;
collector_stats.last_failure = Some(Instant::now());
collector_stats.consecutive_failures += 1;
warn!("Collection '{}' failed: {}", collector_name, error);
                    // Escalate to an error log once failures become consecutive
if collector_stats.consecutive_failures >= 5 {
error!(
"Collector '{}' has {} consecutive failures",
collector_name, collector_stats.consecutive_failures
);
}
}
}
}
if let (Some(sender), Some(output)) = (&self.metrics_sender, publish_output) {
if let Err(error) = sender.send(output) {
warn!("Metrics channel send error: {}", error);
}
}
}
pub fn get_stats_handle(&self) -> Arc<RwLock<SchedulerStats>> {
self.stats.clone()
}
pub async fn shutdown(&self) {
info!("Requesting scheduler shutdown");
if let Err(e) = self.sender.send(SchedulerEvent::Shutdown) {
error!("Failed to send shutdown event: {}", e);
}
}
}
impl Default for CollectorScheduler {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone)]
pub struct HealthChecker {
stats: Arc<RwLock<SchedulerStats>>,
max_consecutive_failures: u32,
max_failure_rate: f64,
}
impl HealthChecker {
pub fn new(stats: Arc<RwLock<SchedulerStats>>) -> Self {
Self {
stats,
max_consecutive_failures: 10,
max_failure_rate: 0.5, // 50% failure rate threshold
}
}
pub async fn check_health(&self) -> HealthStatus {
let stats = self.stats.read().await;
let mut unhealthy_collectors = Vec::new();
let mut degraded_collectors = Vec::new();
for (name, collector_stats) in &stats.collector_stats {
// Check consecutive failures
if collector_stats.consecutive_failures >= self.max_consecutive_failures {
unhealthy_collectors.push(name.clone());
continue;
}
// Check failure rate
if collector_stats.total_collections > 10 {
let failure_rate = collector_stats.failed_collections as f64
/ collector_stats.total_collections as f64;
if failure_rate >= self.max_failure_rate {
degraded_collectors.push(name.clone());
}
}
// Check if collector hasn't succeeded recently
if let Some(last_success) = collector_stats.last_success {
if last_success.elapsed() > Duration::from_secs(300) {
// 5 minutes
degraded_collectors.push(name.clone());
}
} else if collector_stats.total_collections > 5 {
// No successful collections after several attempts
unhealthy_collectors.push(name.clone());
}
}
if !unhealthy_collectors.is_empty() {
HealthStatus::Unhealthy {
unhealthy_collectors,
degraded_collectors,
}
} else if !degraded_collectors.is_empty() {
HealthStatus::Degraded {
degraded_collectors,
}
} else {
HealthStatus::Healthy
}
}
}
#[derive(Debug, Clone)]
pub enum HealthStatus {
Healthy,
Degraded {
degraded_collectors: Vec<String>,
},
Unhealthy {
unhealthy_collectors: Vec<String>,
degraded_collectors: Vec<String>,
},
}
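// Illustrative wiring (not from this commit); `collector` is any concrete
// `Collector` implementation, and the spawned task stands in for the agent's
// ZMQ publisher.
async fn run_scheduler_sketch(collector: Arc<dyn Collector>) -> Result<(), CollectorError> {
    let (tx, mut rx) = mpsc::unbounded_channel();
    tokio::spawn(async move {
        while let Some(output) = rx.recv().await {
            // A real agent would serialize this CollectorOutput into a
            // MetricsEnvelope and publish it over ZMQ.
            let _ = output;
        }
    });
    let mut scheduler = CollectorScheduler::new();
    scheduler.set_metrics_sender(tx);
    scheduler.add_collector(collector);
    let _health = HealthChecker::new(scheduler.get_stats_handle());
    scheduler.start().await
}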

73
config/agent.example.toml Normal file
View File

@ -0,0 +1,73 @@
# CM Dashboard Agent Configuration
# Example configuration file for the ZMQ metrics agent
[agent]
# Hostname to advertise in metrics (auto-detected if not specified)
hostname = "srv01"
# Log level: trace, debug, info, warn, error
log_level = "info"
# Maximum number of metrics to buffer before dropping
metrics_buffer_size = 1000
[zmq]
# ZMQ publisher port
port = 6130
# Bind address (0.0.0.0 for all interfaces, 127.0.0.1 for localhost only)
bind_address = "0.0.0.0"
# ZMQ socket timeouts in milliseconds
send_timeout_ms = 5000
receive_timeout_ms = 5000
[collectors.smart]
# Enable SMART metrics collection (disk health, temperature, wear)
enabled = true
# Collection interval in milliseconds (minimum 1000ms)
interval_ms = 5000
# List of storage devices to monitor (without /dev/ prefix)
devices = ["nvme0n1", "sda", "sdb"]
# Timeout for smartctl commands in milliseconds
timeout_ms = 30000
[collectors.service]
# Enable service metrics collection (systemd services)
enabled = true
# Collection interval in milliseconds (minimum 500ms)
interval_ms = 2000
# List of systemd services to monitor
services = [
"gitea",
"immich",
"vaultwarden",
"unifi",
"smart-metrics-api",
"service-metrics-api",
"backup-metrics-api"
]
# Timeout for systemctl commands in milliseconds
timeout_ms = 10000
[collectors.backup]
# Enable backup metrics collection (restic integration)
enabled = true
# Collection interval in milliseconds (minimum 5000ms)
interval_ms = 30000
# Restic repository path (leave empty to disable restic integration)
restic_repo = "/srv/backups/restic"
# Systemd service name for backup monitoring
backup_service = "restic-backup"
# Timeout for restic and backup commands in milliseconds
timeout_ms = 30000
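# Example invocations (flags defined in agent/src/main.rs; paths are examples):
#   cm-dashboard-agent --config /etc/cm-dashboard/agent.toml
#   cm-dashboard-agent --show-config                 # print detected config and exit
#   cm-dashboard-agent --no-auto-detect --port 6131
#   cm-dashboard-agent --write-config agent.toml     # persist the resolved config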

View File

@ -18,3 +18,4 @@ tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] }
tracing-appender = "0.2"
zmq = "0.10"
gethostname = "0.4"

View File

@ -0,0 +1,37 @@
# CM Dashboard configuration
[hosts]
# default_host = "srv01"
[[hosts.hosts]]
name = "srv01"
enabled = true
# metadata = { rack = "R1" }
[[hosts.hosts]]
name = "labbox"
enabled = true
[dashboard]
tick_rate_ms = 250
history_duration_minutes = 60
[[dashboard.widgets]]
id = "nvme"
enabled = true
[[dashboard.widgets]]
id = "services"
enabled = true
[[dashboard.widgets]]
id = "backup"
enabled = true
[[dashboard.widgets]]
id = "alerts"
enabled = true
[filesystem]
# cache_dir = "/var/lib/cm-dashboard/cache"
# history_dir = "/var/lib/cm-dashboard/history"

View File

@ -0,0 +1,12 @@
# Optional separate hosts configuration
[hosts]
# default_host = "srv01"
[[hosts.hosts]]
name = "srv01"
enabled = true
[[hosts.hosts]]
name = "labbox"
enabled = true
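# Host selection (dashboard/src/app.rs): an explicitly requested host wins;
# otherwise the local hostname is tried first, then remaining enabled entries,
# then default_host, and finally "localhost" as a last resort.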

View File

@ -5,6 +5,7 @@ use std::time::{Duration, Instant};
use anyhow::Result;
use chrono::{DateTime, Utc};
use crossterm::event::{KeyCode, KeyEvent, KeyEventKind};
use gethostname::gethostname;
use crate::config;
use crate::data::config::{AppConfig, DataSourceKind, HostTarget, ZmqConfig};
@ -100,8 +101,8 @@ impl App {
let host_count = self.hosts.len();
let retention = self.history.retention();
self.status = format!(
"Monitoring • hosts: {} • ticks: {} • refresh: {:?} • retention: {:?}",
host_count, self.tick_count, self.options.tick_rate, retention
"Monitoring • hosts: {} • refresh: {:?} • retention: {:?}",
host_count, self.options.tick_rate, retention
);
}
@ -321,49 +322,94 @@ impl App {
}
fn build_initial_status(host: Option<&String>, config_path: Option<&PathBuf>) -> String {
match (host, config_path) {
(Some(host), Some(path)) => {
let detected = Self::local_hostname();
match (host, config_path, detected.as_ref()) {
(Some(host), Some(path), _) => {
format!("Ready • host: {} • config: {}", host, path.display())
}
(Some(host), None) => format!("Ready • host: {}", host),
(None, Some(path)) => format!("Ready • config: {}", path.display()),
(None, None) => "Ready • no host selected".to_string(),
(Some(host), None, _) => format!("Ready • host: {}", host),
(None, Some(path), Some(local)) => format!(
"Ready • host: {} (auto) • config: {}",
local,
path.display()
),
(None, Some(path), None) => format!("Ready • config: {}", path.display()),
(None, None, Some(local)) => format!("Ready • host: {} (auto)", local),
(None, None, None) => "Ready • no host selected".to_string(),
}
}
fn select_hosts(host: Option<&String>, config: Option<&AppConfig>) -> Vec<HostTarget> {
let mut targets = Vec::new();
let Some(config) = config else {
return targets;
};
if let Some(filter) = host {
let normalized = filter.to_lowercase();
let host_filter = host.map(|value| value.to_lowercase());
for entry in &config.hosts.hosts {
if !entry.enabled {
continue;
}
if let Some(filter) = &host_filter {
if entry.name.to_lowercase() != *filter {
continue;
if let Some(config) = config {
if let Some(entry) = config.hosts.hosts.iter().find(|candidate| {
candidate.enabled && candidate.name.to_lowercase() == normalized
}) {
return vec![entry.clone()];
}
}
targets.push(entry.clone());
return vec![HostTarget::from_name(filter.clone())];
}
if targets.is_empty() {
if let Some(default_host) = &config.hosts.default_host {
if host_filter.is_none() {
if let Some(entry) = config.hosts.hosts.iter().find(|candidate| {
candidate.enabled && candidate.name.eq_ignore_ascii_case(default_host)
}) {
targets.push(entry.clone());
let local_host = Self::local_hostname();
if let Some(config) = config {
if let Some(local) = local_host.as_ref() {
if let Some(entry) = config.hosts.hosts.iter().find(|candidate| {
candidate.enabled && candidate.name.eq_ignore_ascii_case(local)
}) {
targets.push(entry.clone());
} else {
targets.push(HostTarget::from_name(local.clone()));
}
}
for entry in &config.hosts.hosts {
if !entry.enabled {
continue;
}
if targets
.iter()
.any(|existing| existing.name.eq_ignore_ascii_case(&entry.name))
{
continue;
}
targets.push(entry.clone());
}
if targets.len() <= 1 {
if let Some(default_host) = &config.hosts.default_host {
if !targets
.iter()
.any(|existing| existing.name.eq_ignore_ascii_case(default_host))
{
if let Some(entry) = config.hosts.hosts.iter().find(|candidate| {
candidate.enabled && candidate.name.eq_ignore_ascii_case(default_host)
}) {
targets.push(entry.clone());
}
}
}
}
if targets.is_empty() {
if let Some(local) = local_host {
targets.push(HostTarget::from_name(local));
}
}
} else if let Some(local) = local_host {
targets.push(HostTarget::from_name(local));
}
if targets.is_empty() {
targets.push(HostTarget::from_name("localhost".to_string()));
}
targets
@ -437,6 +483,18 @@ impl App {
}
}
impl App {
fn local_hostname() -> Option<String> {
let raw = gethostname();
let value = raw.to_string_lossy().trim().to_string();
if value.is_empty() {
None
} else {
Some(value)
}
}
}
#[derive(Debug, Clone)]
pub struct HostDisplayData {
pub name: String,

View File

@ -19,6 +19,8 @@ pub struct DriveInfo {
pub wear_level: f32,
pub power_on_hours: u64,
pub available_spare: f32,
pub capacity_gb: Option<f32>,
pub used_gb: Option<f32>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -44,6 +46,28 @@ pub struct ServiceSummary {
pub failed: usize,
pub memory_used_mb: f32,
pub memory_quota_mb: f32,
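    // Fields below are new in this commit; #[serde(default)] keeps envelopes
    // from older agents (which omit them) deserializable.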
#[serde(default)]
pub system_memory_used_mb: f32,
#[serde(default)]
pub system_memory_total_mb: f32,
#[serde(default)]
pub disk_used_gb: f32,
#[serde(default)]
pub disk_total_gb: f32,
#[serde(default)]
pub cpu_load_1: f32,
#[serde(default)]
pub cpu_load_5: f32,
#[serde(default)]
pub cpu_load_15: f32,
#[serde(default)]
pub cpu_freq_mhz: Option<f32>,
#[serde(default)]
pub cpu_temp_c: Option<f32>,
#[serde(default)]
pub gpu_load_percent: Option<f32>,
#[serde(default)]
pub gpu_temp_c: Option<f32>,
}
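// Illustrative summary payload with the new fields (hypothetical values):
// {"healthy":5,"degraded":0,"failed":0,
//  "memory_used_mb":812.0,"memory_quota_mb":4096.0,
//  "system_memory_used_mb":6144.0,"system_memory_total_mb":16384.0,
//  "disk_used_gb":42.5,"disk_total_gb":476.9,
//  "cpu_load_1":0.42,"cpu_load_5":0.38,"cpu_load_15":0.31,
//  "cpu_freq_mhz":3400.0,"cpu_temp_c":52.0,
//  "gpu_load_percent":null,"gpu_temp_c":null}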
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -54,6 +78,8 @@ pub struct ServiceInfo {
pub memory_quota_mb: f32,
pub cpu_percent: f32,
pub sandbox_limit: Option<f32>,
#[serde(default)]
pub disk_used_gb: f32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]

View File

@ -6,7 +6,10 @@ mod ui;
use std::fs;
use std::io::{self, Stdout};
use std::path::{Path, PathBuf};
use std::sync::OnceLock;
use std::sync::{
atomic::{AtomicBool, Ordering},
Arc, OnceLock,
};
use std::time::Duration;
use crate::data::metrics::{BackupMetrics, ServiceMetrics, SmartMetrics};
@ -100,8 +103,14 @@ async fn main() -> Result<()> {
let mut app = App::new(options)?;
let (event_tx, mut event_rx) = unbounded_channel();
let shutdown_flag = Arc::new(AtomicBool::new(false));
let zmq_task = if let Some(context) = app.zmq_context() {
Some(spawn_metrics_task(context, event_tx.clone()))
Some(spawn_metrics_task(
context,
event_tx.clone(),
shutdown_flag.clone(),
))
} else {
None
};
@ -109,9 +118,12 @@ async fn main() -> Result<()> {
let mut terminal = setup_terminal()?;
let result = run_app(&mut terminal, &mut app, &mut event_rx);
teardown_terminal(terminal)?;
shutdown_flag.store(true, Ordering::Relaxed);
let _ = event_tx.send(AppEvent::Shutdown);
if let Some(handle) = zmq_task {
handle.abort();
if let Err(join_error) = handle.await {
warn!(%join_error, "ZMQ metrics task ended unexpectedly");
}
}
result
}
@ -206,9 +218,13 @@ fn prepare_log_writer() -> Result<tracing_appender::non_blocking::NonBlocking> {
Ok(non_blocking)
}
fn spawn_metrics_task(context: ZmqContext, sender: UnboundedSender<AppEvent>) -> JoinHandle<()> {
fn spawn_metrics_task(
context: ZmqContext,
sender: UnboundedSender<AppEvent>,
shutdown: Arc<AtomicBool>,
) -> JoinHandle<()> {
tokio::spawn(async move {
match spawn_blocking(move || metrics_blocking_loop(context, sender)).await {
match spawn_blocking(move || metrics_blocking_loop(context, sender, shutdown)).await {
Ok(Ok(())) => {}
Ok(Err(error)) => warn!(%error, "ZMQ metrics worker exited with error"),
Err(join_error) => warn!(%join_error, "ZMQ metrics worker panicked"),
@ -216,12 +232,23 @@ fn spawn_metrics_task(context: ZmqContext, sender: UnboundedSender<AppEvent>) ->
})
}
fn metrics_blocking_loop(context: ZmqContext, sender: UnboundedSender<AppEvent>) -> Result<()> {
fn metrics_blocking_loop(
context: ZmqContext,
sender: UnboundedSender<AppEvent>,
shutdown: Arc<AtomicBool>,
) -> Result<()> {
let zmq_context = NativeZmqContext::new();
let socket = zmq_context
.socket(zmq::SUB)
.context("failed to create ZMQ SUB socket")?;
socket
.set_linger(0)
.context("failed to configure ZMQ linger")?;
socket
.set_rcvtimeo(1_000)
.context("failed to configure ZMQ receive timeout")?;
for endpoint in context.endpoints() {
debug!(%endpoint, "connecting to ZMQ endpoint");
socket
@ -239,7 +266,7 @@ fn metrics_blocking_loop(context: ZmqContext, sender: UnboundedSender<AppEvent>)
.context("failed to subscribe to all ZMQ topics")?;
}
loop {
while !shutdown.load(Ordering::Relaxed) {
match socket.recv_msg(0) {
Ok(message) => {
if let Err(error) = handle_zmq_message(&message, &sender) {
@ -247,11 +274,18 @@ fn metrics_blocking_loop(context: ZmqContext, sender: UnboundedSender<AppEvent>)
}
}
Err(error) => {
if error == zmq::Error::EAGAIN {
continue;
}
warn!(%error, "ZMQ receive error");
std::thread::sleep(Duration::from_secs(1));
std::thread::sleep(Duration::from_millis(250));
}
}
}
debug!("ZMQ metrics worker shutting down");
Ok(())
}
fn handle_zmq_message(
@ -442,7 +476,7 @@ tick_rate_ms = 250
history_duration_minutes = 60
[[dashboard.widgets]]
id = "nvme"
id = "storage"
enabled = true
[[dashboard.widgets]]

View File

@ -1,51 +1,299 @@
use ratatui::layout::Rect;
use chrono::{DateTime, Utc};
use ratatui::layout::{Constraint, Rect};
use ratatui::style::{Color, Modifier, Style};
use ratatui::text::{Line, Span};
use ratatui::widgets::{Block, Borders, Paragraph, Wrap};
use ratatui::text::Span;
use ratatui::widgets::{Block, Borders, Cell, Paragraph, Row, Table, Wrap};
use ratatui::Frame;
use crate::app::HostDisplayData;
use crate::ui::memory::{evaluate_performance, PerfSeverity};
pub fn render(frame: &mut Frame, hosts: &[HostDisplayData], area: Rect) {
let block = Block::default()
.title("Alerts")
.borders(Borders::ALL)
.style(Style::default().fg(Color::LightRed));
let (severity, ok_count, warn_count, fail_count) = classify_hosts(hosts);
let color = match severity {
AlertSeverity::Critical => Color::Red,
AlertSeverity::Warning => Color::Yellow,
AlertSeverity::Healthy => Color::Green,
AlertSeverity::Unknown => Color::LightCyan,
};
let mut lines = Vec::new();
let title = format!(
"Alerts • ok:{} warn:{} fail:{}",
ok_count, warn_count, fail_count
);
let block = Block::default()
.title(Span::styled(
title,
Style::default().fg(color).add_modifier(Modifier::BOLD),
))
.borders(Borders::ALL)
.border_style(Style::default().fg(color))
.style(Style::default().fg(Color::White));
let inner = block.inner(area);
frame.render_widget(block, area);
if hosts.is_empty() {
lines.push(Line::from("No hosts configured"));
} else {
for host in hosts {
if let Some(error) = &host.last_error {
lines.push(Line::from(vec![
Span::styled(&host.name, Style::default().add_modifier(Modifier::BOLD)),
Span::raw(": "),
Span::styled(error, Style::default().fg(Color::Red)),
]));
continue;
}
frame.render_widget(
Paragraph::new("No hosts configured")
.wrap(Wrap { trim: true })
.style(Style::default().fg(Color::White)),
inner,
);
return;
}
if let Some(smart) = host.smart.as_ref() {
if let Some(issue) = smart.issues.first() {
lines.push(Line::from(vec![
Span::styled(&host.name, Style::default().add_modifier(Modifier::BOLD)),
Span::raw(": "),
Span::styled(issue, Style::default().fg(Color::Yellow)),
]));
continue;
}
}
let header = Row::new(vec![
Cell::from("Host"),
Cell::from("Status"),
Cell::from("Timestamp"),
])
.style(
Style::default()
.fg(Color::White)
.add_modifier(Modifier::BOLD),
);
lines.push(Line::from(vec![
Span::styled(&host.name, Style::default().add_modifier(Modifier::BOLD)),
Span::raw(": OK"),
]));
let rows = hosts.iter().map(|host| {
let (status, severity, emphasize) = host_status(host);
let row_style = severity_style(severity);
let update = latest_timestamp(host)
.map(|ts| ts.format("%Y-%m-%d %H:%M:%S").to_string())
.unwrap_or_else(|| "".to_string());
let status_cell = if emphasize {
Cell::from(Span::styled(
status.clone(),
Style::default().add_modifier(Modifier::BOLD),
))
} else {
Cell::from(status.clone())
};
Row::new(vec![
Cell::from(host.name.clone()),
status_cell,
Cell::from(update),
])
.style(row_style)
});
let table = Table::new(rows)
.header(header)
.style(Style::default().fg(Color::White))
.widths(&[
Constraint::Percentage(20),
Constraint::Length(20),
Constraint::Min(24),
])
.column_spacing(2);
frame.render_widget(table, inner);
}
#[derive(Copy, Clone, Eq, PartialEq)]
enum AlertSeverity {
Healthy,
Warning,
Critical,
Unknown,
}
fn classify_hosts(hosts: &[HostDisplayData]) -> (AlertSeverity, usize, usize, usize) {
let mut ok = 0;
let mut warn = 0;
let mut fail = 0;
for host in hosts {
let severity = host_severity(host);
match severity {
AlertSeverity::Healthy => ok += 1,
AlertSeverity::Warning => warn += 1,
AlertSeverity::Critical => fail += 1,
AlertSeverity::Unknown => warn += 1,
}
}
let paragraph = Paragraph::new(lines).wrap(Wrap { trim: true }).block(block);
let highest = if fail > 0 {
AlertSeverity::Critical
} else if warn > 0 {
AlertSeverity::Warning
} else if ok > 0 {
AlertSeverity::Healthy
} else {
AlertSeverity::Unknown
};
frame.render_widget(paragraph, area);
(highest, ok, warn, fail)
}
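/// Severity precedence: transport errors first, then SMART criticals and
/// warnings, then service failures/degradation and performance thresholds,
/// then backup status; hosts with no data at all are Unknown.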
fn host_severity(host: &HostDisplayData) -> AlertSeverity {
if host.last_error.is_some() {
return AlertSeverity::Critical;
}
if let Some(smart) = host.smart.as_ref() {
if smart.summary.critical > 0 {
return AlertSeverity::Critical;
}
if smart.summary.warning > 0 || !smart.issues.is_empty() {
return AlertSeverity::Warning;
}
}
if let Some(services) = host.services.as_ref() {
if services.summary.failed > 0 {
return AlertSeverity::Critical;
}
if services.summary.degraded > 0 {
return AlertSeverity::Warning;
}
let (perf_severity, _) = evaluate_performance(&services.summary);
match perf_severity {
PerfSeverity::Critical => return AlertSeverity::Critical,
PerfSeverity::Warning => return AlertSeverity::Warning,
PerfSeverity::Ok => {}
}
}
if let Some(backup) = host.backup.as_ref() {
match backup.overall_status {
crate::data::metrics::BackupStatus::Failed => return AlertSeverity::Critical,
crate::data::metrics::BackupStatus::Warning => return AlertSeverity::Warning,
_ => {}
}
}
if host.smart.is_none() && host.services.is_none() && host.backup.is_none() {
AlertSeverity::Unknown
} else {
AlertSeverity::Healthy
}
}
fn host_status(host: &HostDisplayData) -> (String, AlertSeverity, bool) {
if let Some(error) = &host.last_error {
return (format!("error: {}", error), AlertSeverity::Critical, true);
}
if let Some(smart) = host.smart.as_ref() {
if smart.summary.critical > 0 {
return (
"critical: SMART critical".to_string(),
AlertSeverity::Critical,
true,
);
}
if let Some(issue) = smart.issues.first() {
return (format!("warning: {}", issue), AlertSeverity::Warning, true);
}
}
if let Some(services) = host.services.as_ref() {
if services.summary.failed > 0 {
return (
format!("critical: {} failed svc", services.summary.failed),
AlertSeverity::Critical,
true,
);
}
if services.summary.degraded > 0 {
return (
format!("warning: {} degraded svc", services.summary.degraded),
AlertSeverity::Warning,
true,
);
}
let (perf_severity, reason) = evaluate_performance(&services.summary);
if let Some(reason_text) = reason {
match perf_severity {
PerfSeverity::Critical => {
return (
format!("critical: {}", reason_text),
AlertSeverity::Critical,
true,
);
}
PerfSeverity::Warning => {
return (
format!("warning: {}", reason_text),
AlertSeverity::Warning,
true,
);
}
PerfSeverity::Ok => {}
}
}
}
if let Some(backup) = host.backup.as_ref() {
match backup.overall_status {
crate::data::metrics::BackupStatus::Failed => {
return (
"critical: backup failed".to_string(),
AlertSeverity::Critical,
true,
);
}
crate::data::metrics::BackupStatus::Warning => {
return (
"warning: backup warning".to_string(),
AlertSeverity::Warning,
true,
);
}
_ => {}
}
}
if host.smart.is_none() && host.services.is_none() && host.backup.is_none() {
let status = if host.last_success.is_none() {
"pending: awaiting metrics"
} else {
"pending: no recent data"
};
return (status.to_string(), AlertSeverity::Warning, false);
}
("ok".to_string(), AlertSeverity::Healthy, false)
}
fn severity_style(severity: AlertSeverity) -> Style {
match severity {
AlertSeverity::Critical => Style::default().fg(Color::Red),
AlertSeverity::Warning => Style::default().fg(Color::Yellow),
AlertSeverity::Healthy => Style::default().fg(Color::White),
AlertSeverity::Unknown => Style::default().fg(Color::LightCyan),
}
}
fn latest_timestamp(host: &HostDisplayData) -> Option<DateTime<Utc>> {
let mut latest = host.last_success;
if let Some(smart) = host.smart.as_ref() {
latest = Some(match latest {
Some(current) => current.max(smart.timestamp),
None => smart.timestamp,
});
}
if let Some(services) = host.services.as_ref() {
latest = Some(match latest {
Some(current) => current.max(services.timestamp),
None => services.timestamp,
});
}
if let Some(backup) = host.backup.as_ref() {
latest = Some(match latest {
Some(current) => current.max(backup.timestamp),
None => backup.timestamp,
});
}
latest
}

View File

@ -1,62 +1,166 @@
use ratatui::layout::Rect;
use ratatui::layout::{Constraint, Direction, Layout, Rect};
use ratatui::style::{Color, Modifier, Style};
use ratatui::text::{Line, Span};
use ratatui::widgets::{Block, Borders, Paragraph, Wrap};
use ratatui::widgets::{Block, Borders, Cell, Paragraph, Row, Table, Wrap};
use ratatui::Frame;
use crate::app::HostDisplayData;
use crate::data::metrics::{BackupMetrics, BackupStatus};
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
let block = Block::default()
.title("Backups")
.borders(Borders::ALL)
.style(Style::default().fg(Color::LightGreen));
let mut lines = Vec::new();
match host {
Some(data) => {
if let Some(metrics) = data.backup.as_ref() {
lines.push(Line::from(vec![
Span::styled("Host: ", Style::default().add_modifier(Modifier::BOLD)),
Span::raw(data.name.clone()),
]));
lines.push(Line::from(format!("Status: {:?}", metrics.overall_status)));
if let Some(last_success) = metrics.backup.last_success.as_ref() {
lines.push(Line::from(format!(
"Last success: {}",
last_success.format("%Y-%m-%d %H:%M:%S")
)));
}
if let Some(last_failure) = metrics.backup.last_failure.as_ref() {
lines.push(Line::from(vec![
Span::styled("Last failure: ", Style::default().fg(Color::Red)),
Span::raw(last_failure.format("%Y-%m-%d %H:%M:%S").to_string()),
]));
}
lines.push(Line::from(format!(
"Snapshots: {} • Size: {:.1} GiB",
metrics.backup.snapshot_count, metrics.backup.size_gb
)));
lines.push(Line::from(format!(
"Pending jobs: {} (enabled: {})",
metrics.service.pending_jobs, metrics.service.enabled
)));
render_metrics(frame, data, metrics, area);
} else {
lines.push(Line::from(format!(
"Host {} awaiting backup metrics",
data.name
)));
render_placeholder(
frame,
area,
&format!("Host {} awaiting backup metrics", data.name),
);
}
}
None => lines.push(Line::from("No hosts configured")),
None => render_placeholder(frame, area, "No hosts configured"),
}
}
fn render_metrics(frame: &mut Frame, _host: &HostDisplayData, metrics: &BackupMetrics, area: Rect) {
let color = backup_status_color(&metrics.overall_status);
let title = format!("Backups • status: {:?}", metrics.overall_status);
let block = Block::default()
.title(Span::styled(
title,
Style::default().fg(color).add_modifier(Modifier::BOLD),
))
.borders(Borders::ALL)
.border_style(Style::default().fg(color))
.style(Style::default().fg(Color::White));
let inner = block.inner(area);
frame.render_widget(block, area);
let chunks = Layout::default()
.direction(Direction::Vertical)
.constraints([Constraint::Length(2), Constraint::Min(1)])
.split(inner);
let summary_line = Line::from(vec![
Span::styled("Snapshots: ", Style::default().add_modifier(Modifier::BOLD)),
Span::raw(metrics.backup.snapshot_count.to_string()),
Span::raw(" • Size: "),
Span::raw(format!("{:.1} GiB", metrics.backup.size_gb)),
Span::raw(" • Last success: "),
Span::raw(format_timestamp(metrics.backup.last_success.as_ref())),
]);
frame.render_widget(
Paragraph::new(summary_line)
.wrap(Wrap { trim: true })
.style(Style::default().fg(Color::White)),
chunks[0],
);
let header = Row::new(vec![Cell::from("Aspect"), Cell::from("Details")]).style(
Style::default()
.fg(Color::White)
.add_modifier(Modifier::BOLD),
);
let mut rows = Vec::new();
rows.push(
Row::new(vec![
Cell::from("Repo"),
Cell::from(format!(
"Snapshots: {} • Size: {:.1} GiB",
metrics.backup.snapshot_count, metrics.backup.size_gb
)),
])
.style(Style::default().fg(Color::White)),
);
rows.push(
Row::new(vec![
Cell::from("Service"),
Cell::from(format!(
"Enabled: {} • Pending jobs: {}",
metrics.service.enabled, metrics.service.pending_jobs
)),
])
.style(backup_severity_style(&metrics.overall_status)),
);
if let Some(last_failure) = metrics.backup.last_failure.as_ref() {
rows.push(
Row::new(vec![
Cell::from("Last failure"),
Cell::from(last_failure.format("%Y-%m-%d %H:%M:%S").to_string()),
])
.style(Style::default().fg(Color::Red)),
);
}
let paragraph = Paragraph::new(lines).wrap(Wrap { trim: true }).block(block);
if let Some(message) = metrics.service.last_message.as_ref() {
let message_style = match metrics.overall_status {
BackupStatus::Failed => Style::default().fg(Color::Red),
BackupStatus::Warning => Style::default().fg(Color::Yellow),
_ => Style::default().fg(Color::White),
};
frame.render_widget(paragraph, area);
rows.push(
Row::new(vec![
Cell::from("Last message"),
Cell::from(message.clone()),
])
.style(message_style),
);
}
let table = Table::new(rows)
.header(header)
.style(Style::default().fg(Color::White))
.widths(&[Constraint::Length(13), Constraint::Min(10)])
.column_spacing(2);
frame.render_widget(table, chunks[1]);
}
fn backup_status_color(status: &BackupStatus) -> Color {
match status {
BackupStatus::Failed => Color::Red,
BackupStatus::Warning => Color::Yellow,
BackupStatus::Unknown => Color::LightYellow,
BackupStatus::Healthy => Color::Green,
}
}
fn format_timestamp(timestamp: Option<&chrono::DateTime<chrono::Utc>>) -> String {
timestamp
.map(|ts| ts.format("%Y-%m-%d %H:%M:%S").to_string())
.unwrap_or_else(|| "".to_string())
}
fn render_placeholder(frame: &mut Frame, area: Rect, message: &str) {
let block = Block::default()
.title("Backups")
.borders(Borders::ALL)
.border_style(Style::default().fg(Color::LightGreen))
.style(Style::default().fg(Color::White));
let inner = block.inner(area);
frame.render_widget(block, area);
frame.render_widget(
Paragraph::new(Line::from(message))
.wrap(Wrap { trim: true })
.style(Style::default().fg(Color::White)),
inner,
);
}
fn backup_severity_style(status: &BackupStatus) -> Style {
match status {
BackupStatus::Failed => Style::default().fg(Color::Red),
BackupStatus::Warning => Style::default().fg(Color::Yellow),
BackupStatus::Unknown => Style::default().fg(Color::LightCyan),
BackupStatus::Healthy => Style::default().fg(Color::White),
}
}

View File

@ -1,19 +1,25 @@
use ratatui::layout::{Constraint, Direction, Layout, Rect};
use ratatui::style::{Color, Modifier, Style};
use ratatui::text::Span;
use ratatui::widgets::Block;
use ratatui::widgets::{Block, Cell, Row, Table};
use ratatui::Frame;
use crate::app::App;
use super::{alerts, backup, memory, nvme, services};
use super::{alerts, backup, memory, storage, services};
pub fn render(frame: &mut Frame, app: &App) {
let host_summaries = app.host_display_data();
let primary_host = app.active_host_display();
let title = if let Some(host) = primary_host.as_ref() {
format!("CM Dashboard • {}", host.name)
} else {
"CM Dashboard".to_string()
};
let root_block = Block::default().title(Span::styled(
"CM Dashboard",
title,
Style::default()
.fg(Color::Cyan)
.add_modifier(Modifier::BOLD),
@ -48,7 +54,7 @@ pub fn render(frame: &mut Frame, app: &App) {
.constraints([Constraint::Percentage(50), Constraint::Percentage(50)])
.split(vertical_chunks[2]);
nvme::render(frame, primary_host.as_ref(), top[0]);
storage::render(frame, primary_host.as_ref(), top[0]);
services::render(frame, primary_host.as_ref(), top[1]);
memory::render(frame, primary_host.as_ref(), middle[0]);
backup::render(frame, primary_host.as_ref(), middle[1]);
@ -61,72 +67,125 @@ pub fn render(frame: &mut Frame, app: &App) {
}
fn render_status(frame: &mut Frame, app: &App, area: Rect) {
    let connected = app.zmq_connected();
    let title_color = if connected { Color::Green } else { Color::Red };
    let title_suffix = if connected {
        "connected"
    } else {
        "disconnected"
    };
    let block = Block::default()
        .title(Span::styled(
            format!("Status • ZMQ {title_suffix}"),
            Style::default()
                .fg(title_color)
                .add_modifier(Modifier::BOLD),
        ))
        .borders(ratatui::widgets::Borders::ALL)
        .border_style(Style::default().fg(title_color))
        .style(Style::default().fg(Color::White));
    let inner = block.inner(area);
    frame.render_widget(block, area);
    let mut rows: Vec<Row> = Vec::new();
    let status_style = if connected {
        Style::default().fg(Color::White)
    } else {
        Style::default().fg(Color::Red)
    };
    let default_style = Style::default().fg(Color::White);
    rows.push(
        Row::new(vec![
            Cell::from("Status"),
            Cell::from(app.status_text().to_string()),
        ])
        .style(status_style),
    );
    rows.push(
        Row::new(vec![
            Cell::from("Data source"),
            Cell::from(if connected {
                "ZMQ connected"
            } else {
                "ZMQ disconnected"
            }),
        ])
        .style(status_style),
    );
    if let Some((index, host)) = app.active_host_info() {
        let mut detail = format!("{} ({}/{})", host.name, index + 1, app.hosts().len());
        if let Some(state) = app
            .host_display_data()
            .into_iter()
            .find(|entry| entry.name == host.name)
        {
            if let Some(last_success) = state.last_success {
                detail = format!(
                    "{} • last success {}",
                    detail,
                    last_success.format("%H:%M:%S")
                );
            }
        }
        rows.push(
            Row::new(vec![Cell::from("Active host"), Cell::from(detail)]).style(default_style),
        );
    } else {
        rows.push(Row::new(vec![Cell::from("Active host"), Cell::from("")]).style(default_style));
    }
    if let Some(path) = app.active_config_path() {
        rows.push(
            Row::new(vec![
                Cell::from("Config"),
                Cell::from(path.display().to_string()),
            ])
            .style(default_style),
        );
    }
    let retention = app.history().retention();
    rows.push(
        Row::new(vec![
            Cell::from("History"),
            Cell::from(format!("{} seconds", retention.as_secs())),
        ])
        .style(default_style),
    );
    if let Some(config) = app.config() {
        if let Some(default_host) = &config.hosts.default_host {
            rows.push(
                Row::new(vec![
                    Cell::from("Default host"),
                    Cell::from(default_host.clone()),
                ])
                .style(default_style),
            );
        }
    }
    rows.push(
        Row::new(vec![
            Cell::from("Monitored hosts"),
            Cell::from(app.hosts().len().to_string()),
        ])
        .style(default_style),
    );
    let table = Table::new(rows)
        .widths(&[Constraint::Length(18), Constraint::Min(24)])
        .column_spacing(2)
        .style(default_style);
    frame.render_widget(table, inner);
}
fn inner_rect(area: Rect) -> Rect {


@ -5,52 +5,277 @@ use ratatui::widgets::{Block, Borders, Paragraph, Wrap};
use ratatui::Frame;
use crate::app::HostDisplayData;
use crate::data::metrics::{ServiceMetrics, ServiceSummary};
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
    match host {
        Some(data) => {
            if let Some(metrics) = data.services.as_ref() {
                render_metrics(frame, data, metrics, area);
            } else {
                render_placeholder(
                    frame,
                    area,
                    &format!("Host {} awaiting service metrics", data.name),
                );
            }
        }
        None => render_placeholder(frame, area, "No hosts configured"),
    }
}
fn render_metrics(frame: &mut Frame, _host: &HostDisplayData, metrics: &ServiceMetrics, area: Rect) {
let summary = &metrics.summary;
let system_total = if summary.system_memory_total_mb > 0.0 {
summary.system_memory_total_mb
} else {
summary.memory_quota_mb
};
let system_used = if summary.system_memory_used_mb > 0.0 {
summary.system_memory_used_mb
} else {
summary.memory_used_mb
};
let usage_ratio = if system_total > 0.0 {
(system_used / system_total) * 100.0
} else {
0.0
};
let (perf_severity, _reason) = evaluate_performance(summary);
let (color, severity_label) = match perf_severity {
PerfSeverity::Critical => (Color::Red, "crit"),
PerfSeverity::Warning => (Color::Yellow, "warn"),
PerfSeverity::Ok => (Color::Green, "ok"),
};
let title = format!("CPU / Memory • {}", severity_label);
let block = Block::default()
.title(Span::styled(
title,
Style::default().fg(color).add_modifier(Modifier::BOLD),
))
.borders(Borders::ALL)
.border_style(Style::default().fg(color))
.style(Style::default().fg(Color::White));
let inner = block.inner(area);
frame.render_widget(block, area);
let mut lines = Vec::new();
// Check if memory should be highlighted due to alert
let memory_color = if usage_ratio >= 95.0 {
Color::Red // Critical
} else if usage_ratio >= 80.0 {
Color::Yellow // Warning
} else {
Color::White // Normal
};
lines.push(Line::from(vec![
Span::styled(
format!("System memory: {:.1} / {:.1} MiB ({:.1}%)",
system_used, system_total, usage_ratio),
Style::default().fg(memory_color)
)
]));
// Check if CPU load should be highlighted due to alert
let cpu_load_color = if summary.cpu_load_5 >= 4.0 {
Color::Red // Critical
} else if summary.cpu_load_5 >= 2.0 {
Color::Yellow // Warning
} else {
Color::White // Normal
};
lines.push(Line::from(vec![
Span::styled(
format!("CPU load (1/5/15): {:.2} {:.2} {:.2}",
summary.cpu_load_1, summary.cpu_load_5, summary.cpu_load_15),
Style::default().fg(cpu_load_color)
)
]));
lines.push(Line::from(vec![
Span::raw("CPU freq: "),
Span::raw(format_optional_metric(summary.cpu_freq_mhz, " MHz")),
]));
// Check if CPU temp should be highlighted due to alert
let cpu_temp_color = if let Some(temp) = summary.cpu_temp_c {
if temp >= 90.0 {
Color::Red // Critical
} else if temp >= 80.0 {
Color::Yellow // Warning
} else {
Color::White // Normal
}
} else {
Color::White // Normal
};
lines.push(Line::from(vec![
Span::raw("CPU temp: "),
Span::styled(
format_optional_metric(summary.cpu_temp_c, "°C"),
Style::default().fg(cpu_temp_color)
),
]));
if summary.gpu_load_percent.is_some() || summary.gpu_temp_c.is_some() {
// Check if GPU load should be highlighted due to alert
let gpu_load_color = if let Some(load) = summary.gpu_load_percent {
if load >= 95.0 {
Color::Red // Critical
} else if load >= 85.0 {
Color::Yellow // Warning
} else {
Color::White // Normal
}
} else {
Color::White // Normal
};
lines.push(Line::from(vec![
Span::styled("GPU load: ", Style::default().add_modifier(Modifier::BOLD)),
Span::styled(
format_optional_percent(summary.gpu_load_percent),
Style::default().fg(gpu_load_color)
),
]));
// Check if GPU temp should be highlighted due to alert
let gpu_temp_color = if let Some(temp) = summary.gpu_temp_c {
if temp >= 85.0 {
Color::Red // Critical
} else if temp >= 75.0 {
Color::Yellow // Warning
} else {
Color::White // Normal
}
} else {
Color::White // Normal
};
lines.push(Line::from(vec![
Span::styled("GPU temp: ", Style::default().add_modifier(Modifier::BOLD)),
Span::styled(
format_optional_metric(summary.gpu_temp_c, "°C"),
Style::default().fg(gpu_temp_color)
),
]));
}
frame.render_widget(
Paragraph::new(lines)
.wrap(Wrap { trim: true })
.style(Style::default().fg(Color::White)),
inner,
);
}
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) enum PerfSeverity {
Ok,
Warning,
Critical,
}
fn format_optional_metric(value: Option<f32>, unit: &str) -> String {
match value {
Some(number) => format!("{:.1}{}", number, unit),
None => "".to_string(),
}
}
fn format_optional_percent(value: Option<f32>) -> String {
match value {
Some(number) => format!("{:.0}%", number),
None => "".to_string(),
}
}
fn render_placeholder(frame: &mut Frame, area: Rect, message: &str) {
let block = Block::default()
.title("CPU / Memory")
.borders(Borders::ALL)
.border_style(Style::default().fg(Color::LightMagenta))
.style(Style::default().fg(Color::White));
let inner = block.inner(area);
frame.render_widget(block, area);
frame.render_widget(
Paragraph::new(Line::from(message))
.wrap(Wrap { trim: true })
.style(Style::default().fg(Color::White)),
inner,
);
}
pub(crate) fn evaluate_performance(summary: &ServiceSummary) -> (PerfSeverity, Option<String>) {
let mem_percent = if summary.system_memory_total_mb > 0.0 {
(summary.system_memory_used_mb / summary.system_memory_total_mb) * 100.0
} else if summary.memory_quota_mb > 0.0 {
(summary.memory_used_mb / summary.memory_quota_mb) * 100.0
} else {
0.0
};
let mut severity = PerfSeverity::Ok;
let mut reason: Option<String> = None;
let mut consider = |level: PerfSeverity, message: String| {
if level > severity {
severity = level;
reason = Some(message);
}
};
if mem_percent >= 95.0 {
consider(PerfSeverity::Critical, format!("RAM {:.0}%", mem_percent));
} else if mem_percent >= 80.0 {
consider(PerfSeverity::Warning, format!("RAM {:.0}%", mem_percent));
}
let load = summary.cpu_load_5;
if load >= 4.0 {
consider(PerfSeverity::Critical, format!("CPU load {:.2}", load));
} else if load >= 2.0 {
consider(PerfSeverity::Warning, format!("CPU load {:.2}", load));
}
if let Some(temp) = summary.cpu_temp_c {
if temp >= 90.0 {
consider(PerfSeverity::Critical, format!("CPU temp {:.0}°C", temp));
} else if temp >= 80.0 {
consider(PerfSeverity::Warning, format!("CPU temp {:.0}°C", temp));
}
}
if let Some(load) = summary.gpu_load_percent {
if load >= 95.0 {
consider(PerfSeverity::Critical, format!("GPU load {:.0}%", load));
} else if load >= 85.0 {
consider(PerfSeverity::Warning, format!("GPU load {:.0}%", load));
}
}
if let Some(temp) = summary.gpu_temp_c {
if temp >= 85.0 {
consider(PerfSeverity::Critical, format!("GPU temp {:.0}°C", temp));
} else if temp >= 75.0 {
consider(PerfSeverity::Warning, format!("GPU temp {:.0}°C", temp));
}
}
if severity == PerfSeverity::Ok {
(PerfSeverity::Ok, None)
} else {
(severity, reason)
}
}
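// A minimal sketch (illustrative, not part of the diff) of how the thresholds
// above combine; the field values are hypothetical:
//
//   cpu_load_5 = 2.5, everything else nominal
//       -> (PerfSeverity::Warning, Some("CPU load 2.50"))
//   cpu_temp_c = Some(91.0) together with cpu_load_5 = 2.5
//       -> (PerfSeverity::Critical, Some("CPU temp 91°C"))
//   all metrics below their warning thresholds
//       -> (PerfSeverity::Ok, None)
//
// `consider` only upgrades: the reported reason is the one attached to the
// highest severity seen, since equal or lower levels never overwrite it.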


@ -2,7 +2,7 @@ pub mod alerts;
pub mod backup;
pub mod dashboard;
pub mod memory;
pub mod storage;
pub mod services;
pub use dashboard::render;


@ -1,58 +0,0 @@
use ratatui::layout::Rect;
use ratatui::style::{Color, Modifier, Style};
use ratatui::text::{Line, Span};
use ratatui::widgets::{Block, Borders, Paragraph, Wrap};
use ratatui::Frame;
use crate::app::HostDisplayData;
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
let block = Block::default()
.title("NVMe Health")
.borders(Borders::ALL)
.style(Style::default().fg(Color::LightCyan));
let mut lines = Vec::new();
match host {
Some(data) => {
if let Some(metrics) = data.smart.as_ref() {
lines.push(Line::from(vec![
Span::styled("Host: ", Style::default().add_modifier(Modifier::BOLD)),
Span::raw(data.name.clone()),
]));
lines.push(Line::from(vec![
Span::styled("Status: ", Style::default().add_modifier(Modifier::BOLD)),
Span::raw(metrics.status.clone()),
]));
lines.push(Line::from(format!(
"Drives healthy/warn/crit: {}/{}/{}",
metrics.summary.healthy, metrics.summary.warning, metrics.summary.critical
)));
lines.push(Line::from(format!(
"Capacity used: {:.1} / {:.1} GiB",
metrics.summary.capacity_used_gb, metrics.summary.capacity_total_gb
)));
if let Some(issue) = metrics.issues.first() {
lines.push(Line::from(vec![
Span::styled("Issue: ", Style::default().fg(Color::Yellow)),
Span::raw(issue.clone()),
]));
}
} else {
lines.push(Line::from(format!(
"Host {} has no SMART data yet",
data.name
)));
}
}
None => {
lines.push(Line::from("No hosts configured"));
}
}
let paragraph = Paragraph::new(lines).wrap(Wrap { trim: true }).block(block);
frame.render_widget(paragraph, area);
}


@ -1,54 +1,257 @@
use ratatui::layout::{Constraint, Direction, Layout, Rect};
use ratatui::style::{Color, Modifier, Style};
use ratatui::text::{Line, Span};
use ratatui::widgets::{Block, Borders, Cell, Paragraph, Row, Table, Wrap};
use ratatui::Frame;
use crate::app::HostDisplayData;
use crate::data::metrics::{ServiceStatus, ServiceSummary};
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
    match host {
        Some(data) => {
            if let Some(metrics) = data.services.as_ref() {
                render_metrics(frame, data, metrics, area);
            } else {
                render_placeholder(
                    frame,
                    area,
                    &format!("Host {} has no service metrics yet", data.name),
                );
            }
        }
        None => render_placeholder(frame, area, "No hosts configured"),
    }
}
fn render_metrics(
frame: &mut Frame,
_host: &HostDisplayData,
metrics: &crate::data::metrics::ServiceMetrics,
area: Rect,
) {
let summary = &metrics.summary;
let color = summary_color(summary);
let disk_summary = format_disk_summary(summary.disk_used_gb, summary.disk_total_gb);
let title = format!(
"Services • ok:{} warn:{} fail:{} • Disk: {}",
summary.healthy, summary.degraded, summary.failed, disk_summary
);
let block = Block::default()
.title(Span::styled(
title,
Style::default().fg(color).add_modifier(Modifier::BOLD),
))
.borders(Borders::ALL)
.border_style(Style::default().fg(color))
.style(Style::default().fg(Color::White));
let inner = block.inner(area);
frame.render_widget(block, area);
let chunks = Layout::default()
.direction(Direction::Vertical)
.constraints([Constraint::Length(2), Constraint::Min(1)])
.split(inner);
let mut summary_lines = Vec::new();
summary_lines.push(Line::from(vec![
Span::styled(
"Service memory: ",
Style::default().add_modifier(Modifier::BOLD),
),
Span::raw(format_memory(summary)),
]));
let disk_text = if summary.disk_total_gb > 0.0 {
format!(
"{:.1} / {:.1} GiB",
summary.disk_used_gb, summary.disk_total_gb
)
} else {
"".to_string()
};
summary_lines.push(Line::from(vec![
Span::styled(
"Disk usage: ",
Style::default().add_modifier(Modifier::BOLD),
),
Span::raw(disk_text),
]));
summary_lines.push(Line::from(vec![
Span::styled(
"Services tracked: ",
Style::default().add_modifier(Modifier::BOLD),
),
Span::raw(metrics.services.len().to_string()),
]));
frame.render_widget(
Paragraph::new(summary_lines)
.wrap(Wrap { trim: true })
.style(Style::default().fg(Color::White)),
chunks[0],
);
if metrics.services.is_empty() {
frame.render_widget(
Paragraph::new("No services reported")
.wrap(Wrap { trim: true })
.style(Style::default().fg(Color::White)),
chunks[1],
);
return;
}
    let mut services = metrics.services.clone();
    services.sort_by(|a, b| {
        status_weight(&a.status)
            .cmp(&status_weight(&b.status))
            .then_with(|| a.name.cmp(&b.name))
    });
let header = Row::new(vec![
Cell::from(""),
Cell::from("Service"),
Cell::from("Memory"),
Cell::from("Disk"),
])
.style(
Style::default()
.fg(Color::White)
.add_modifier(Modifier::BOLD),
);
let rows = services.into_iter().map(|svc| {
let row_style = status_style(&svc.status);
Row::new(vec![
Cell::from(status_symbol(&svc.status)),
Cell::from(format_service_name(&svc.name)),
Cell::from(format_memory_value(svc.memory_used_mb, svc.memory_quota_mb)),
Cell::from(format_disk_value(svc.disk_used_gb)),
])
.style(row_style)
});
let table = Table::new(rows)
.header(header)
.style(Style::default().fg(Color::White))
.widths(&[
Constraint::Length(1),
Constraint::Length(10),
Constraint::Length(12),
Constraint::Length(8),
])
.column_spacing(2);
frame.render_widget(table, chunks[1]);
}
fn status_weight(status: &ServiceStatus) -> i32 {
match status {
ServiceStatus::Stopped => 0,
ServiceStatus::Degraded => 1,
ServiceStatus::Restarting => 2,
ServiceStatus::Running => 3,
}
}
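// Sorting ascending on this weight surfaces problem services first; e.g. a
// hypothetical set [sshd: Running, nginx: Stopped, zrepl: Degraded] orders as
// nginx (0), zrepl (1), sshd (3), with name as the tiebreaker within a status.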
fn status_symbol(status: &ServiceStatus) -> &'static str {
match status {
ServiceStatus::Running => "",
ServiceStatus::Degraded => "!",
ServiceStatus::Restarting => "",
ServiceStatus::Stopped => "",
}
}
fn status_style(status: &ServiceStatus) -> Style {
match status {
ServiceStatus::Running => Style::default().fg(Color::White),
ServiceStatus::Degraded => Style::default().fg(Color::Yellow),
ServiceStatus::Restarting => Style::default().fg(Color::Yellow),
ServiceStatus::Stopped => Style::default().fg(Color::Red),
}
}
fn summary_color(summary: &ServiceSummary) -> Color {
if summary.failed > 0 {
Color::Red
} else if summary.degraded > 0 {
Color::Yellow
} else {
Color::Green
}
}
fn format_memory(summary: &ServiceSummary) -> String {
if summary.memory_quota_mb > 0.0 {
format!(
"{:.1}/{:.1} MiB",
summary.memory_used_mb, summary.memory_quota_mb
)
} else {
format!("{:.1} MiB used", summary.memory_used_mb)
}
}
fn format_memory_value(used: f32, quota: f32) -> String {
if quota > 0.05 {
format!("{:.1}/{:.1} MiB", used, quota)
} else if used > 0.05 {
format!("{:.1} MiB", used)
} else {
"".to_string()
}
}
fn format_disk_summary(used: f32, total: f32) -> String {
if total > 0.05 {
format!("{:.1}/{:.1} GiB", used, total)
} else if used > 0.05 {
format!("{:.1} GiB", used)
} else {
"".to_string()
}
}
fn format_disk_value(used: f32) -> String {
if used >= 1.0 {
format!("{:.1} GiB", used)
    } else if used >= 0.001 { // at least 1 MiB, since `used` is in GiB
        format!("{:.0} MiB", used * 1024.0)
    } else if used > 0.0 {
        "<1 MiB".to_string()
} else {
"".to_string()
}
}
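// Rough expected output, assuming `used` is measured in GiB:
//   format_disk_value(2.5)    -> "2.5 GiB"
//   format_disk_value(0.5)    -> "512 MiB"   (scaled by 1024)
//   format_disk_value(0.0004) -> "<1 MiB"
//   format_disk_value(0.0)    -> ""          (blank cell for services with no tracked files)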
fn format_service_name(name: &str) -> String {
let mut truncated = String::with_capacity(10);
for ch in name.chars().take(10) {
truncated.push(ch);
}
truncated
}
fn render_placeholder(frame: &mut Frame, area: Rect, message: &str) {
let block = Block::default()
.title("Services")
.borders(Borders::ALL)
.border_style(Style::default().fg(Color::Yellow))
.style(Style::default().fg(Color::White));
let inner = block.inner(area);
frame.render_widget(block, area);
frame.render_widget(
Paragraph::new(Line::from(message))
.wrap(Wrap { trim: true })
.style(Style::default().fg(Color::White)),
inner,
);
}

dashboard/src/ui/storage.rs (new file, 196 lines)

@ -0,0 +1,196 @@
use ratatui::layout::{Constraint, Direction, Layout, Rect};
use ratatui::style::{Color, Modifier, Style};
use ratatui::text::{Line, Span};
use ratatui::widgets::{Block, Borders, Cell, Paragraph, Row, Table, Wrap};
use ratatui::Frame;
use crate::app::HostDisplayData;
use crate::data::metrics::SmartMetrics;
pub fn render(frame: &mut Frame, host: Option<&HostDisplayData>, area: Rect) {
match host {
Some(data) => {
if let Some(metrics) = data.smart.as_ref() {
render_metrics(frame, data, metrics, area);
} else {
render_placeholder(
frame,
area,
&format!("Host {} has no SMART data yet", data.name),
);
}
}
None => render_placeholder(frame, area, "No hosts configured"),
}
}
fn render_metrics(frame: &mut Frame, _host: &HostDisplayData, metrics: &SmartMetrics, area: Rect) {
let color = smart_status_color(&metrics.status);
let title = format!(
"Storage • ok:{} warn:{} crit:{}",
metrics.summary.healthy, metrics.summary.warning, metrics.summary.critical
);
let block = Block::default()
.title(Span::styled(
title,
Style::default().fg(color).add_modifier(Modifier::BOLD),
))
.borders(Borders::ALL)
.border_style(Style::default().fg(color))
.style(Style::default().fg(Color::White));
let inner = block.inner(area);
frame.render_widget(block, area);
let issue_count = metrics.issues.len();
let body_constraints = if issue_count > 0 {
vec![Constraint::Min(1), Constraint::Length(2)]
} else {
vec![Constraint::Min(1)]
};
let body_chunks = Layout::default()
.direction(Direction::Vertical)
.constraints(body_constraints)
.split(inner);
if metrics.drives.is_empty() {
frame.render_widget(
Paragraph::new("No drives reported")
.wrap(Wrap { trim: true })
.style(Style::default().fg(Color::White)),
body_chunks[0],
);
} else {
let header = Row::new(vec![
Cell::from("Drive"),
Cell::from("Temp"),
Cell::from("Wear"),
Cell::from("Spare"),
Cell::from("Hours"),
Cell::from("Capacity"),
Cell::from("Usage"),
])
.style(
Style::default()
.fg(Color::White)
.add_modifier(Modifier::BOLD),
);
let rows = metrics.drives.iter().map(|drive| {
Row::new(vec![
Cell::from(format_drive_name(&drive.name)),
Cell::from(format_temperature(drive.temperature_c)),
Cell::from(format_percent(drive.wear_level)),
Cell::from(format_percent(drive.available_spare)),
Cell::from(drive.power_on_hours.to_string()),
Cell::from(format_capacity(drive.capacity_gb)),
Cell::from(format_usage(drive.used_gb, drive.capacity_gb)),
])
});
let table = Table::new(rows)
.header(header)
.style(Style::default().fg(Color::White))
.widths(&[
Constraint::Length(10), // Drive name
Constraint::Length(8), // Temp
Constraint::Length(8), // Wear
Constraint::Length(8), // Spare
Constraint::Length(10), // Hours
Constraint::Length(10), // Capacity
Constraint::Min(8), // Usage
])
.column_spacing(2);
frame.render_widget(table, body_chunks[0]);
}
if issue_count > 0 {
let issue_line = Line::from(vec![
Span::styled("Issue: ", Style::default().fg(Color::Yellow)),
Span::styled(
metrics.issues[0].clone(),
Style::default().fg(Color::Yellow),
),
]);
frame.render_widget(
Paragraph::new(issue_line)
.wrap(Wrap { trim: true })
.style(Style::default().fg(Color::White)),
body_chunks[1],
);
}
}
fn smart_status_color(status: &str) -> Color {
match status.to_uppercase().as_str() {
"CRITICAL" => Color::Red,
"WARNING" => Color::Yellow,
_ => Color::Green,
}
}
fn format_temperature(value: f32) -> String {
if value.abs() < f32::EPSILON {
"".to_string()
} else {
format!("{:.0}°C", value)
}
}
fn format_percent(value: f32) -> String {
if value.abs() < f32::EPSILON {
"".to_string()
} else {
format!("{:.0}%", value)
}
}
fn format_drive_name(name: &str) -> String {
let mut truncated = String::with_capacity(10);
for ch in name.chars().take(10) {
truncated.push(ch);
}
truncated
}
fn format_capacity(value: Option<f32>) -> String {
match value {
Some(gb) if gb > 0.0 => format!("{:.0}G", gb),
_ => "".to_string(),
}
}
fn format_usage(used: Option<f32>, capacity: Option<f32>) -> String {
match (used, capacity) {
(Some(used_gb), Some(total_gb)) if used_gb > 0.0 && total_gb > 0.0 => {
let percent = (used_gb / total_gb) * 100.0;
format!("{:.0}G ({:.0}%)", used_gb, percent)
}
(Some(used_gb), None) if used_gb > 0.0 => {
format!("{:.0}G", used_gb)
}
_ => "".to_string(),
}
}
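// For example, 128 GiB used on a 512 GiB drive renders as "128G (25%)"; when
// only usage is known, the percentage is omitted ("128G").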
fn render_placeholder(frame: &mut Frame, area: Rect, message: &str) {
let block = Block::default()
.title("Storage")
.borders(Borders::ALL)
.border_style(Style::default().fg(Color::LightCyan))
.style(Style::default().fg(Color::White));
let inner = block.inner(area);
frame.render_widget(block, area);
frame.render_widget(
Paragraph::new(Line::from(message))
.wrap(Wrap { trim: true })
.style(Style::default().fg(Color::White)),
inner,
);
}


@ -17,3 +17,6 @@ pub struct MetricsEnvelope {
#[serde(default)]
pub metrics: Value,
}
// Alias for backward compatibility
pub type MessageEnvelope = MetricsEnvelope;
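// A minimal round-trip sketch, assuming the remaining envelope fields (elided
// in this hunk) all carry serde defaults; the payload itself is free-form JSON:
//
//   let raw = r#"{"metrics": {"disk_used_gb": 1.5}}"#;
//   let envelope: MessageEnvelope = serde_json::from_str(raw).unwrap();
//   let used = envelope.metrics["disk_used_gb"].as_f64(); // Some(1.5)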