From 37f2650200cfa5a3d5f25b07879d1df8118b7352 Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Thu, 27 Nov 2025 21:49:44 +0100 Subject: [PATCH 01/14] Document cached collector architecture plan Add architectural plan for separating ZMQ sending from data collection to prevent false 'host offline' alerts caused by slow collectors. Key concepts: - Shared cache (Arc<RwLock<AgentData>>) - Independent async collector tasks with different update rates - ZMQ sender always sends every 1s from cache - Fast collectors (1s), medium (5s), slow (60s) - No blocking regardless of collector speed --- CLAUDE.md | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index f2619a8..3786ee8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -156,6 +156,56 @@ Complete migration from string-based metrics to structured JSON data. Eliminates - ✅ Backward compatibility via bridge conversion to existing UI widgets - ✅ All string parsing bugs eliminated +### Cached Collector Architecture (🚧 PLANNED) + +**Problem:** Blocking collectors prevent timely ZMQ transmission, causing false "host offline" alerts. 
+ +**Previous (Sequential Blocking):** +``` +Every 1 second: + └─ collect_all_data() [BLOCKS for 2-10+ seconds] + ├─ CPU (fast: 10ms) + ├─ Memory (fast: 20ms) + ├─ Disk SMART (slow: 3s per drive × 4 drives = 12s) + ├─ Service disk usage (slow: 2-8s per service) + └─ Docker (medium: 500ms) + └─ send_via_zmq() [Only after ALL collection completes] + +Result: If any collector takes >10s → "host offline" false alert +``` + +**New (Cached Independent Collectors):** +``` +Shared Cache: Arc<RwLock<AgentData>> + +Background Collectors (independent async tasks): +├─ Fast collectors (CPU, RAM, Network) +│ └─ Update cache every 1 second +├─ Medium collectors (Services, Docker) +│ └─ Update cache every 5 seconds +└─ Slow collectors (Disk usage, SMART data) + └─ Update cache every 60 seconds + +ZMQ Sender (separate async task): +Every 1 second: + └─ Read current cache + └─ Send via ZMQ [Always instant, never blocked] +``` + +**Benefits:** +- ✅ ZMQ sends every 1 second regardless of collector speed +- ✅ No false "host offline" alerts from slow collectors +- ✅ Different update rates for different metrics (CPU=1s, SMART=60s) +- ✅ System stays responsive even with slow operations +- ✅ Slow collectors can use longer timeouts without blocking + +**Implementation:** +- Shared `AgentData` cache wrapped in `Arc<RwLock<AgentData>>` +- Each collector spawned as independent tokio task +- Collectors update their section of cache at their own rate +- ZMQ sender reads cache every 1s and transmits +- Stale data acceptable for slow-changing metrics (disk usage, SMART) + ### Maintenance Mode - Agent checks for `/tmp/cm-maintenance` file before sending notifications From 2740de9b54e5239acfe9d788f9e6a877f7274331 Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Thu, 27 Nov 2025 22:37:20 +0100 Subject: [PATCH 02/14] Implement cached collector architecture with configurable timeouts Major architectural refactor to eliminate false "host offline" alerts: - Replace sequential blocking collectors with independent async tasks - 
Each collector runs at configurable interval and updates shared cache - ZMQ sender reads cache every 1-2s regardless of collector speed - Collector intervals: CPU/Memory (1-10s), Backup/NixOS (30-60s), Disk/Systemd (60-300s) All intervals now configurable via NixOS config: - collectors.*.interval_seconds (collection frequency per collector) - collectors.*.command_timeout_seconds (timeout for shell commands) - notifications.check_interval_seconds (status change detection rate) Command timeouts increased from hardcoded 2-3s to configurable 10-30s: - Disk collector: 30s (SMART operations, lsblk) - Systemd collector: 15s (systemctl, docker, du commands) - Network collector: 10s (ip route, ip addr) Benefits: - No false "offline" alerts when slow collectors take >10s - Different update rates for different metric types - Better resource management with longer timeouts - Full NixOS configuration control Bump version to v0.1.193 --- CLAUDE.md | 44 +++++-- Cargo.lock | 6 +- agent/Cargo.toml | 2 +- agent/src/agent.rs | 210 +++++++++++++++++++++----------- agent/src/collectors/disk.rs | 2 +- agent/src/collectors/network.rs | 14 ++- agent/src/collectors/systemd.rs | 39 +++--- agent/src/config/mod.rs | 28 +++++ dashboard/Cargo.toml | 2 +- shared/Cargo.toml | 2 +- 10 files changed, 239 insertions(+), 110 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 3786ee8..32fe7aa 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -156,7 +156,7 @@ Complete migration from string-based metrics to structured JSON data. Eliminates - ✅ Backward compatibility via bridge conversion to existing UI widgets - ✅ All string parsing bugs eliminated -### Cached Collector Architecture (🚧 PLANNED) +### Cached Collector Architecture (✅ IMPLEMENTED) **Problem:** Blocking collectors prevent timely ZMQ transmission, causing false "host offline" alerts. 
@@ -199,12 +199,42 @@ Every 1 second: - ✅ System stays responsive even with slow operations - ✅ Slow collectors can use longer timeouts without blocking -**Implementation:** -- Shared `AgentData` cache wrapped in `Arc<RwLock<AgentData>>` -- Each collector spawned as independent tokio task -- Collectors update their section of cache at their own rate -- ZMQ sender reads cache every 1s and transmits -- Stale data acceptable for slow-changing metrics (disk usage, SMART) +**Implementation Details:** +- **Shared cache**: `Arc<RwLock<AgentData>>` initialized at agent startup +- **Collector intervals**: Fully configurable via NixOS config (`interval_seconds` per collector) + - Recommended: Fast (1-10s): CPU, Memory, Network + - Recommended: Medium (30-60s): Backup, NixOS + - Recommended: Slow (60-300s): Disk, Systemd +- **Independent tasks**: Each collector spawned as separate tokio task in `Agent::new()` +- **Cache updates**: Collectors acquire write lock → update → release immediately +- **ZMQ sender**: Main loop reads cache every `collection_interval_seconds` and broadcasts +- **Notification check**: Runs every `notifications.check_interval_seconds` +- **Lock strategy**: Short-lived write locks prevent blocking, read locks for transmission +- **Stale data**: Acceptable for slow-changing metrics (SMART data, disk usage) + +**Configuration (NixOS):** +All intervals and timeouts configurable in `services/cm-dashboard.nix`: + +Collection Intervals: +- `collectors.cpu.interval_seconds` (default: 10s) +- `collectors.memory.interval_seconds` (default: 2s) +- `collectors.disk.interval_seconds` (default: 300s) +- `collectors.systemd.interval_seconds` (default: 10s) +- `collectors.backup.interval_seconds` (default: 60s) +- `collectors.network.interval_seconds` (default: 10s) +- `collectors.nixos.interval_seconds` (default: 60s) +- `notifications.check_interval_seconds` (default: 30s) +- `collection_interval_seconds` - ZMQ transmission rate (default: 2s) + +Command Timeouts (prevent resource leaks from hung 
commands): +- `collectors.disk.command_timeout_seconds` (default: 30s) - lsblk, smartctl, etc. +- `collectors.systemd.command_timeout_seconds` (default: 15s) - systemctl, docker, du +- `collectors.network.command_timeout_seconds` (default: 10s) - ip route, ip addr + +**Code Locations:** +- agent/src/agent.rs:59-133 - Collector task spawning +- agent/src/agent.rs:151-179 - Independent collector task runner +- agent/src/agent.rs:199-207 - ZMQ sender in main loop ### Maintenance Mode diff --git a/Cargo.lock b/Cargo.lock index fc8849f..2a78eb9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -279,7 +279,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.191" +version = "0.1.192" dependencies = [ "anyhow", "chrono", @@ -301,7 +301,7 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.191" +version = "0.1.192" dependencies = [ "anyhow", "async-trait", @@ -324,7 +324,7 @@ dependencies = [ [[package]] name = "cm-dashboard-shared" -version = "0.1.191" +version = "0.1.192" dependencies = [ "chrono", "serde", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index c4f70ac..dcdf453 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.192" +version = "0.1.193" edition = "2021" [dependencies] diff --git a/agent/src/agent.rs b/agent/src/agent.rs index d74a4c7..900c7a2 100644 --- a/agent/src/agent.rs +++ b/agent/src/agent.rs @@ -1,13 +1,14 @@ use anyhow::Result; use gethostname::gethostname; +use std::sync::Arc; use std::time::Duration; +use tokio::sync::RwLock; use tokio::time::interval; use tracing::{debug, error, info}; use crate::communication::{AgentCommand, ZmqHandler}; use crate::config::AgentConfig; use crate::collectors::{ - Collector, backup::BackupCollector, cpu::CpuCollector, disk::DiskCollector, @@ -23,7 +24,7 @@ pub struct Agent { hostname: String, config: AgentConfig, zmq_handler: ZmqHandler, - 
collectors: Vec>, + cache: Arc>, notification_manager: NotificationManager, previous_status: Option, } @@ -55,39 +56,94 @@ impl Agent { config.zmq.publisher_port ); - // Initialize collectors - let mut collectors: Vec> = Vec::new(); - - // Add enabled collectors + // Initialize shared cache + let cache = Arc::new(RwLock::new(AgentData::new( + hostname.clone(), + env!("CARGO_PKG_VERSION").to_string() + ))); + info!("Initialized shared agent data cache"); + + // Spawn independent collector tasks + let mut collector_count = 0; + + // CPU collector if config.collectors.cpu.enabled { - collectors.push(Box::new(CpuCollector::new(config.collectors.cpu.clone()))); + let cache_clone = cache.clone(); + let collector = CpuCollector::new(config.collectors.cpu.clone()); + let interval = config.collectors.cpu.interval_seconds; + tokio::spawn(async move { + Self::run_collector_task(cache_clone, collector, Duration::from_secs(interval), "CPU").await; + }); + collector_count += 1; } - + + // Memory collector if config.collectors.memory.enabled { - collectors.push(Box::new(MemoryCollector::new(config.collectors.memory.clone()))); - } - - if config.collectors.disk.enabled { - collectors.push(Box::new(DiskCollector::new(config.collectors.disk.clone()))); - } - - if config.collectors.systemd.enabled { - collectors.push(Box::new(SystemdCollector::new(config.collectors.systemd.clone()))); - } - - if config.collectors.backup.enabled { - collectors.push(Box::new(BackupCollector::new())); + let cache_clone = cache.clone(); + let collector = MemoryCollector::new(config.collectors.memory.clone()); + let interval = config.collectors.memory.interval_seconds; + tokio::spawn(async move { + Self::run_collector_task(cache_clone, collector, Duration::from_secs(interval), "Memory").await; + }); + collector_count += 1; } + // Network collector if config.collectors.network.enabled { - collectors.push(Box::new(NetworkCollector::new(config.collectors.network.clone()))); + let cache_clone = cache.clone(); 
+ let collector = NetworkCollector::new(config.collectors.network.clone()); + let interval = config.collectors.network.interval_seconds; + tokio::spawn(async move { + Self::run_collector_task(cache_clone, collector, Duration::from_secs(interval), "Network").await; + }); + collector_count += 1; } + // Backup collector + if config.collectors.backup.enabled { + let cache_clone = cache.clone(); + let collector = BackupCollector::new(); + let interval = config.collectors.backup.interval_seconds; + tokio::spawn(async move { + Self::run_collector_task(cache_clone, collector, Duration::from_secs(interval), "Backup").await; + }); + collector_count += 1; + } + + // NixOS collector if config.collectors.nixos.enabled { - collectors.push(Box::new(NixOSCollector::new(config.collectors.nixos.clone()))); + let cache_clone = cache.clone(); + let collector = NixOSCollector::new(config.collectors.nixos.clone()); + let interval = config.collectors.nixos.interval_seconds; + tokio::spawn(async move { + Self::run_collector_task(cache_clone, collector, Duration::from_secs(interval), "NixOS").await; + }); + collector_count += 1; } - info!("Initialized {} collectors", collectors.len()); + // Disk collector + if config.collectors.disk.enabled { + let cache_clone = cache.clone(); + let collector = DiskCollector::new(config.collectors.disk.clone()); + let interval = config.collectors.disk.interval_seconds; + tokio::spawn(async move { + Self::run_collector_task(cache_clone, collector, Duration::from_secs(interval), "Disk").await; + }); + collector_count += 1; + } + + // Systemd collector + if config.collectors.systemd.enabled { + let cache_clone = cache.clone(); + let collector = SystemdCollector::new(config.collectors.systemd.clone()); + let interval = config.collectors.systemd.interval_seconds; + tokio::spawn(async move { + Self::run_collector_task(cache_clone, collector, Duration::from_secs(interval), "Systemd").await; + }); + collector_count += 1; + } + + info!("Spawned {} independent 
collector tasks", collector_count); // Initialize notification manager let notification_manager = NotificationManager::new(&config.notifications, &hostname)?; @@ -97,45 +153,79 @@ impl Agent { hostname, config, zmq_handler, - collectors, + cache, notification_manager, previous_status: None, }) } - /// Main agent loop with structured data collection - pub async fn run(&mut self, mut shutdown_rx: tokio::sync::oneshot::Receiver<()>) -> Result<()> { - info!("Starting agent main loop"); + /// Independent collector task runner + async fn run_collector_task( + cache: Arc>, + collector: C, + interval_duration: Duration, + name: &str, + ) where + C: crate::collectors::Collector + Send + 'static, + { + let mut interval_timer = interval(interval_duration); + info!("{} collector task started (interval: {:?})", name, interval_duration); - // Initial collection - if let Err(e) = self.collect_and_broadcast().await { - error!("Initial metric collection failed: {}", e); + loop { + interval_timer.tick().await; + + // Acquire write lock and update cache + { + let mut agent_data = cache.write().await; + match collector.collect_structured(&mut *agent_data).await { + Ok(_) => { + debug!("{} collector updated cache", name); + } + Err(e) => { + error!("{} collector failed: {}", name, e); + } + } + } // Release lock immediately after collection } + } - // Set up intervals + /// Main agent loop with cached data architecture + pub async fn run(&mut self, mut shutdown_rx: tokio::sync::oneshot::Receiver<()>) -> Result<()> { + info!("Starting agent main loop with cached collector architecture"); + + // Set up intervals from config let mut transmission_interval = interval(Duration::from_secs( self.config.collection_interval_seconds, )); - let mut notification_interval = interval(Duration::from_secs(30)); // Check notifications every 30s + let mut notification_interval = interval(Duration::from_secs( + self.config.notifications.check_interval_seconds, + )); + let mut command_interval = 
interval(Duration::from_millis(100)); - // Skip initial ticks to avoid immediate execution + // Skip initial ticks transmission_interval.tick().await; notification_interval.tick().await; + command_interval.tick().await; loop { tokio::select! { _ = transmission_interval.tick() => { - if let Err(e) = self.collect_and_broadcast().await { - error!("Failed to collect and broadcast metrics: {}", e); + // Read current cache state and broadcast via ZMQ + let agent_data = self.cache.read().await.clone(); + if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data).await { + error!("Failed to broadcast agent data: {}", e); + } else { + debug!("Successfully broadcast agent data"); } } _ = notification_interval.tick() => { - // Process any pending notifications - // NOTE: With structured data, we might need to implement status tracking differently - // For now, we skip this until status evaluation is migrated + // Read cache and check for status changes + let agent_data = self.cache.read().await.clone(); + if let Err(e) = self.check_status_changes_and_notify(&agent_data).await { + error!("Failed to check status changes: {}", e); + } } - // Handle incoming commands (check periodically) - _ = tokio::time::sleep(Duration::from_millis(100)) => { + _ = command_interval.tick() => { if let Err(e) = self.handle_commands().await { error!("Error handling commands: {}", e); } @@ -151,35 +241,6 @@ impl Agent { Ok(()) } - /// Collect structured data from all collectors and broadcast via ZMQ - async fn collect_and_broadcast(&mut self) -> Result<()> { - debug!("Starting structured data collection"); - - // Initialize empty AgentData - let mut agent_data = AgentData::new(self.hostname.clone(), env!("CARGO_PKG_VERSION").to_string()); - - // Collect data from all collectors - for collector in &self.collectors { - if let Err(e) = collector.collect_structured(&mut agent_data).await { - error!("Collector failed: {}", e); - // Continue with other collectors even if one fails - } - } - - // 
Check for status changes and send notifications - if let Err(e) = self.check_status_changes_and_notify(&agent_data).await { - error!("Failed to check status changes: {}", e); - } - - // Broadcast the structured data via ZMQ - if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data).await { - error!("Failed to broadcast agent data: {}", e); - } else { - debug!("Successfully broadcast structured agent data"); - } - - Ok(()) - } /// Check for status changes and send notifications async fn check_status_changes_and_notify(&mut self, agent_data: &AgentData) -> Result<()> { @@ -267,9 +328,12 @@ impl Agent { match command { AgentCommand::CollectNow => { - info!("Received immediate collection request"); - if let Err(e) = self.collect_and_broadcast().await { - error!("Failed to collect on demand: {}", e); + info!("Received immediate transmission request"); + // With cached architecture, collectors run independently + // Just send current cache state immediately + let agent_data = self.cache.read().await.clone(); + if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data).await { + error!("Failed to broadcast on demand: {}", e); } } AgentCommand::SetInterval { seconds } => { diff --git a/agent/src/collectors/disk.rs b/agent/src/collectors/disk.rs index 71c53cf..588bec8 100644 --- a/agent/src/collectors/disk.rs +++ b/agent/src/collectors/disk.rs @@ -117,7 +117,7 @@ impl DiskCollector { let mut cmd = Command::new("lsblk"); cmd.args(&["-rn", "-o", "NAME,MOUNTPOINT"]); - let output = run_command_with_timeout(cmd, 2).await + let output = run_command_with_timeout(cmd, self.config.command_timeout_seconds).await .map_err(|e| CollectorError::SystemRead { path: "block devices".to_string(), error: e.to_string(), diff --git a/agent/src/collectors/network.rs b/agent/src/collectors/network.rs index fd4dbe2..5a26b05 100644 --- a/agent/src/collectors/network.rs +++ b/agent/src/collectors/network.rs @@ -8,12 +8,12 @@ use crate::config::NetworkConfig; /// Network interface 
collector with physical/virtual classification and link status pub struct NetworkCollector { - _config: NetworkConfig, + config: NetworkConfig, } impl NetworkCollector { pub fn new(config: NetworkConfig) -> Self { - Self { _config: config } + Self { config } } /// Check if interface is physical (not virtual) @@ -50,8 +50,9 @@ impl NetworkCollector { } /// Get the primary physical interface (the one with default route) - fn get_primary_physical_interface() -> Option { - match Command::new("timeout").args(["2", "ip", "route", "show", "default"]).output() { + fn get_primary_physical_interface(&self) -> Option { + let timeout_str = self.config.command_timeout_seconds.to_string(); + match Command::new("timeout").args([&timeout_str, "ip", "route", "show", "default"]).output() { Ok(output) if output.status.success() => { let output_str = String::from_utf8_lossy(&output.stdout); // Parse: "default via 192.168.1.1 dev eno1 ..." @@ -110,7 +111,8 @@ impl NetworkCollector { // Parse VLAN configuration let vlan_map = Self::parse_vlan_config(); - match Command::new("timeout").args(["2", "ip", "-j", "addr"]).output() { + let timeout_str = self.config.command_timeout_seconds.to_string(); + match Command::new("timeout").args([&timeout_str, "ip", "-j", "addr"]).output() { Ok(output) if output.status.success() => { let json_str = String::from_utf8_lossy(&output.stdout); @@ -195,7 +197,7 @@ impl NetworkCollector { } // Assign primary physical interface as parent to virtual interfaces without explicit parent - let primary_interface = Self::get_primary_physical_interface(); + let primary_interface = self.get_primary_physical_interface(); if let Some(primary) = primary_interface { for interface in interfaces.iter_mut() { // Only assign parent to virtual interfaces that don't already have one diff --git a/agent/src/collectors/systemd.rs b/agent/src/collectors/systemd.rs index bcaa6be..9ba8663 100644 --- a/agent/src/collectors/systemd.rs +++ b/agent/src/collectors/systemd.rs @@ -254,18 
+254,19 @@ impl SystemdCollector { /// Auto-discover interesting services to monitor fn discover_services_internal(&self) -> Result<(Vec, std::collections::HashMap)> { - // First: Get all service unit files (with 3 second timeout) + // First: Get all service unit files + let timeout_str = self.config.command_timeout_seconds.to_string(); let unit_files_output = Command::new("timeout") - .args(&["3", "systemctl", "list-unit-files", "--type=service", "--no-pager", "--plain"]) + .args(&[&timeout_str, "systemctl", "list-unit-files", "--type=service", "--no-pager", "--plain"]) .output()?; if !unit_files_output.status.success() { return Err(anyhow::anyhow!("systemctl list-unit-files command failed")); } - // Second: Get runtime status of all units (with 3 second timeout) + // Second: Get runtime status of all units let units_status_output = Command::new("timeout") - .args(&["3", "systemctl", "list-units", "--type=service", "--all", "--no-pager", "--plain"]) + .args(&[&timeout_str, "systemctl", "list-units", "--type=service", "--all", "--no-pager", "--plain"]) .output()?; if !units_status_output.status.success() { @@ -361,16 +362,17 @@ impl SystemdCollector { } } - // Fallback to systemctl if not in cache (with 2 second timeout) + // Fallback to systemctl if not in cache + let timeout_str = self.config.command_timeout_seconds.to_string(); let output = Command::new("timeout") - .args(&["2", "systemctl", "is-active", &format!("{}.service", service)]) + .args(&[&timeout_str, "systemctl", "is-active", &format!("{}.service", service)]) .output()?; let active_status = String::from_utf8(output.stdout)?.trim().to_string(); - // Get more detailed info (with 2 second timeout) + // Get more detailed info let output = Command::new("timeout") - .args(&["2", "systemctl", "show", &format!("{}.service", service), "--property=LoadState,ActiveState,SubState"]) + .args(&[&timeout_str, "systemctl", "show", &format!("{}.service", service), "--property=LoadState,ActiveState,SubState"]) 
.output()?; let detailed_info = String::from_utf8(output.stdout)?; @@ -430,9 +432,10 @@ impl SystemdCollector { return Ok(0.0); } - // No configured path - try to get WorkingDirectory from systemctl (with 2 second timeout) + // No configured path - try to get WorkingDirectory from systemctl + let timeout_str = self.config.command_timeout_seconds.to_string(); let output = Command::new("timeout") - .args(&["2", "systemctl", "show", &format!("{}.service", service_name), "--property=WorkingDirectory"]) + .args(&[&timeout_str, "systemctl", "show", &format!("{}.service", service_name), "--property=WorkingDirectory"]) .output() .map_err(|e| CollectorError::SystemRead { path: format!("WorkingDirectory for {}", service_name), @@ -452,15 +455,15 @@ impl SystemdCollector { Ok(0.0) } - /// Get size of a directory in GB (with 2 second timeout) + /// Get size of a directory in GB async fn get_directory_size(&self, path: &str) -> Option { use super::run_command_with_timeout; - // Use -s (summary) and --apparent-size for speed, 2 second timeout + // Use -s (summary) and --apparent-size for speed let mut cmd = Command::new("sudo"); cmd.args(&["du", "-s", "--apparent-size", "--block-size=1", path]); - let output = run_command_with_timeout(cmd, 2).await.ok()?; + let output = run_command_with_timeout(cmd, self.config.command_timeout_seconds).await.ok()?; if !output.status.success() { // Log permission errors for debugging but don't spam logs @@ -786,9 +789,10 @@ impl SystemdCollector { let mut containers = Vec::new(); // Check if docker is available (cm-agent user is in docker group) - // Use -a to show ALL containers (running and stopped) with 3 second timeout + // Use -a to show ALL containers (running and stopped) + let timeout_str = self.config.command_timeout_seconds.to_string(); let output = Command::new("timeout") - .args(&["3", "docker", "ps", "-a", "--format", "{{.Names}},{{.Status}}"]) + .args(&[&timeout_str, "docker", "ps", "-a", "--format", "{{.Names}},{{.Status}}"]) 
.output(); let output = match output { @@ -829,9 +833,10 @@ impl SystemdCollector { /// Get docker images as sub-services fn get_docker_images(&self) -> Vec<(String, String, f32)> { let mut images = Vec::new(); - // Check if docker is available (cm-agent user is in docker group) with 3 second timeout + // Check if docker is available (cm-agent user is in docker group) + let timeout_str = self.config.command_timeout_seconds.to_string(); let output = Command::new("timeout") - .args(&["3", "docker", "images", "--format", "{{.Repository}}:{{.Tag}},{{.Size}}"]) + .args(&[&timeout_str, "docker", "images", "--format", "{{.Repository}}:{{.Tag}},{{.Size}}"]) .output(); let output = match output { diff --git a/agent/src/config/mod.rs b/agent/src/config/mod.rs index 8593b54..f0f403a 100644 --- a/agent/src/config/mod.rs +++ b/agent/src/config/mod.rs @@ -79,6 +79,9 @@ pub struct DiskConfig { pub temperature_critical_celsius: f32, pub wear_warning_percent: f32, pub wear_critical_percent: f32, + /// Command timeout in seconds for lsblk, smartctl, etc. 
+ #[serde(default = "default_disk_command_timeout")] + pub command_timeout_seconds: u64, } /// Filesystem configuration entry @@ -108,6 +111,9 @@ pub struct SystemdConfig { pub http_timeout_seconds: u64, pub http_connect_timeout_seconds: u64, pub nginx_latency_critical_ms: f32, + /// Command timeout in seconds for systemctl, docker, du commands + #[serde(default = "default_systemd_command_timeout")] + pub command_timeout_seconds: u64, } @@ -132,6 +138,9 @@ pub struct BackupConfig { pub struct NetworkConfig { pub enabled: bool, pub interval_seconds: u64, + /// Command timeout in seconds for ip route, ip addr commands + #[serde(default = "default_network_command_timeout")] + pub command_timeout_seconds: u64, } /// Notification configuration @@ -145,6 +154,9 @@ pub struct NotificationConfig { pub rate_limit_minutes: u64, /// Email notification batching interval in seconds (default: 60) pub aggregation_interval_seconds: u64, + /// Status check interval in seconds for detecting changes (default: 30) + #[serde(default = "default_notification_check_interval")] + pub check_interval_seconds: u64, /// List of metric names to exclude from email notifications #[serde(default)] pub exclude_email_metrics: Vec, @@ -158,10 +170,26 @@ fn default_heartbeat_interval_seconds() -> u64 { 5 } +fn default_notification_check_interval() -> u64 { + 30 +} + fn default_maintenance_mode_file() -> String { "/tmp/cm-maintenance".to_string() } +fn default_disk_command_timeout() -> u64 { + 30 +} + +fn default_systemd_command_timeout() -> u64 { + 15 +} + +fn default_network_command_timeout() -> u64 { + 10 +} + impl AgentConfig { pub fn from_file>(path: P) -> Result { loader::load_config(path) diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index 5f612ec..ce8d404 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.192" +version = "0.1.193" edition = "2021" [dependencies] diff --git a/shared/Cargo.toml 
b/shared/Cargo.toml index e0c9e6a..3949e8c 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.192" +version = "0.1.193" edition = "2021" [dependencies] From 14618c59c61b2f8d731697f01b8388ace825a809 Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Thu, 27 Nov 2025 22:45:44 +0100 Subject: [PATCH 03/14] Fix data duplication in cached collector architecture Critical bug fix: Collectors were appending to Vecs instead of replacing them, causing duplicate entries with each collection cycle. Fixed by adding .clear() calls before populating: - Memory collector: tmpfs Vec (was showing 11+ duplicates) - Disk collector: drives and pools Vecs - Systemd collector: services Vec - Network collector: Already correct (assigns new Vec) This prevents the exponential growth of duplicate entries in the dashboard UI. --- agent/src/collectors/disk.rs | 6 ++++++ agent/src/collectors/memory.rs | 5 ++++- agent/src/collectors/systemd.rs | 3 +++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/agent/src/collectors/disk.rs b/agent/src/collectors/disk.rs index 588bec8..dfbd5fa 100644 --- a/agent/src/collectors/disk.rs +++ b/agent/src/collectors/disk.rs @@ -530,6 +530,9 @@ impl DiskCollector { /// Populate drives data into AgentData fn populate_drives_data(&self, physical_drives: &[PhysicalDrive], smart_data: &HashMap, agent_data: &mut AgentData) -> Result<(), CollectorError> { + // Clear existing drives data to prevent duplicates in cached architecture + agent_data.system.storage.drives.clear(); + for drive in physical_drives { let smart = smart_data.get(&drive.name); @@ -567,6 +570,9 @@ impl DiskCollector { /// Populate pools data into AgentData fn populate_pools_data(&self, mergerfs_pools: &[MergerfsPool], smart_data: &HashMap, agent_data: &mut AgentData) -> Result<(), CollectorError> { + // Clear existing pools data to prevent duplicates in cached architecture + 
agent_data.system.storage.pools.clear(); + for pool in mergerfs_pools { // Calculate pool health and statuses based on member drive health let (pool_health, health_status, usage_status, data_drive_data, parity_drive_data) = self.calculate_pool_health(pool, smart_data); diff --git a/agent/src/collectors/memory.rs b/agent/src/collectors/memory.rs index e186704..9151ee2 100644 --- a/agent/src/collectors/memory.rs +++ b/agent/src/collectors/memory.rs @@ -97,9 +97,12 @@ impl MemoryCollector { /// Populate tmpfs data into AgentData async fn populate_tmpfs_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> { + // Clear existing tmpfs data to prevent duplicates in cached architecture + agent_data.system.memory.tmpfs.clear(); + // Discover all tmpfs mount points let tmpfs_mounts = self.discover_tmpfs_mounts()?; - + if tmpfs_mounts.is_empty() { debug!("No tmpfs mounts found to monitor"); return Ok(()); diff --git a/agent/src/collectors/systemd.rs b/agent/src/collectors/systemd.rs index 9ba8663..4e17cf4 100644 --- a/agent/src/collectors/systemd.rs +++ b/agent/src/collectors/systemd.rs @@ -915,6 +915,9 @@ impl SystemdCollector { #[async_trait] impl Collector for SystemdCollector { async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> { + // Clear existing services data to prevent duplicates in cached architecture + agent_data.services.clear(); + // Use cached complete data if available and fresh if let Some(cached_complete_services) = self.get_cached_complete_services() { for service_data in cached_complete_services { From ed6399b91437163cf5ab2961f8069986c7c53e13 Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Thu, 27 Nov 2025 22:46:17 +0100 Subject: [PATCH 04/14] Bump version to v0.1.194 --- Cargo.lock | 6 +++--- agent/Cargo.toml | 2 +- dashboard/Cargo.toml | 2 +- shared/Cargo.toml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2a78eb9..5bcd08f 100644 --- 
a/Cargo.lock +++ b/Cargo.lock @@ -279,7 +279,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.192" +version = "0.1.193" dependencies = [ "anyhow", "chrono", @@ -301,7 +301,7 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.192" +version = "0.1.193" dependencies = [ "anyhow", "async-trait", @@ -324,7 +324,7 @@ dependencies = [ [[package]] name = "cm-dashboard-shared" -version = "0.1.192" +version = "0.1.193" dependencies = [ "chrono", "serde", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index dcdf453..555cb45 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.193" +version = "0.1.194" edition = "2021" [dependencies] diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index ce8d404..7dfa429 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.193" +version = "0.1.194" edition = "2021" [dependencies] diff --git a/shared/Cargo.toml b/shared/Cargo.toml index 3949e8c..26314cd 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.193" +version = "0.1.194" edition = "2021" [dependencies] From 01e1f33b66f3f33a448b95be73aabcfdd3a8c6d0 Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Thu, 27 Nov 2025 22:56:58 +0100 Subject: [PATCH 05/14] Fix ZMQ sender blocking - move to independent thread with try_read CRITICAL FIX: The previous cached collector architecture still had ZMQ sending in the main event loop, where it could block waiting for RwLock when collectors were writing. This caused the 3-8 second delays you observed. 
Changes: - Move ZMQ publisher to dedicated std::thread (ZMQ sockets aren't thread-safe) - Use try_read() instead of read() to avoid blocking on write locks - Send previous data if cache is locked by collector - ZMQ now sends every 2s regardless of collector timing - Remove publisher from ZmqHandler (now only handles commands) Architecture: - Collectors: Independent tokio tasks updating shared cache - ZMQ Sender: Dedicated OS thread with its own publisher socket - Main Loop: Only handles commands and notifications This ensures ZMQ transmission is NEVER blocked by slow collectors. Bump version to v0.1.195 --- Cargo.lock | 6 +-- agent/Cargo.toml | 2 +- agent/src/agent.rs | 79 +++++++++++++++++++++++++--------- agent/src/communication/mod.rs | 39 +---------------- dashboard/Cargo.toml | 2 +- shared/Cargo.toml | 2 +- 6 files changed, 67 insertions(+), 63 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5bcd08f..0c683ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -279,7 +279,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.193" +version = "0.1.194" dependencies = [ "anyhow", "chrono", @@ -301,7 +301,7 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.193" +version = "0.1.194" dependencies = [ "anyhow", "async-trait", @@ -324,7 +324,7 @@ dependencies = [ [[package]] name = "cm-dashboard-shared" -version = "0.1.193" +version = "0.1.194" dependencies = [ "chrono", "serde", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 555cb45..86a4a38 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.194" +version = "0.1.195" edition = "2021" [dependencies] diff --git a/agent/src/agent.rs b/agent/src/agent.rs index 900c7a2..3e1d5dd 100644 --- a/agent/src/agent.rs +++ b/agent/src/agent.rs @@ -193,31 +193,73 @@ impl Agent { pub async fn run(&mut self, mut shutdown_rx: 
tokio::sync::oneshot::Receiver<()>) -> Result<()> { info!("Starting agent main loop with cached collector architecture"); - // Set up intervals from config - let mut transmission_interval = interval(Duration::from_secs( - self.config.collection_interval_seconds, - )); + // Spawn independent ZMQ sender task + // Create dedicated ZMQ publisher for the sender task + let cache_clone = self.cache.clone(); + let publisher_config = self.config.zmq.clone(); + let transmission_interval_secs = self.config.collection_interval_seconds; + + std::thread::spawn(move || { + // Create ZMQ publisher in this thread (ZMQ sockets are not thread-safe) + let context = zmq::Context::new(); + let publisher = context.socket(zmq::SocketType::PUB).unwrap(); + let bind_address = format!("tcp://{}:{}", publisher_config.bind_address, publisher_config.publisher_port); + publisher.bind(&bind_address).unwrap(); + publisher.set_sndhwm(1000).unwrap(); + publisher.set_linger(1000).unwrap(); + info!("ZMQ sender task started on {} (interval: {}s)", bind_address, transmission_interval_secs); + + let mut last_sent_data: Option = None; + let interval_duration = std::time::Duration::from_secs(transmission_interval_secs); + let mut next_send = std::time::Instant::now() + interval_duration; + + loop { + // Sleep until next send time + std::thread::sleep(next_send.saturating_duration_since(std::time::Instant::now())); + next_send = std::time::Instant::now() + interval_duration; + + // Try to read cache without blocking - if locked, send last known data + let data_to_send = match cache_clone.try_read() { + Ok(agent_data) => { + let data_clone = agent_data.clone(); + drop(agent_data); // Release lock immediately + last_sent_data = Some(data_clone.clone()); + Some(data_clone) + } + Err(_) => { + // Lock is held by collector - use last sent data + debug!("Cache locked by collector, sending previous data"); + last_sent_data.clone() + } + }; + + if let Some(data) = data_to_send { + // Publish via ZMQ + if let 
Ok(envelope) = cm_dashboard_shared::MessageEnvelope::agent_data(data) { + if let Ok(serialized) = serde_json::to_vec(&envelope) { + if let Err(e) = publisher.send(&serialized, 0) { + error!("Failed to send ZMQ message: {}", e); + } else { + debug!("Successfully broadcast agent data"); + } + } + } + } + } + }); + + // Set up intervals for notifications and commands let mut notification_interval = interval(Duration::from_secs( self.config.notifications.check_interval_seconds, )); let mut command_interval = interval(Duration::from_millis(100)); // Skip initial ticks - transmission_interval.tick().await; notification_interval.tick().await; command_interval.tick().await; loop { tokio::select! { - _ = transmission_interval.tick() => { - // Read current cache state and broadcast via ZMQ - let agent_data = self.cache.read().await.clone(); - if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data).await { - error!("Failed to broadcast agent data: {}", e); - } else { - debug!("Successfully broadcast agent data"); - } - } _ = notification_interval.tick() => { // Read cache and check for status changes let agent_data = self.cache.read().await.clone(); @@ -329,12 +371,9 @@ impl Agent { match command { AgentCommand::CollectNow => { info!("Received immediate transmission request"); - // With cached architecture, collectors run independently - // Just send current cache state immediately - let agent_data = self.cache.read().await.clone(); - if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data).await { - error!("Failed to broadcast on demand: {}", e); - } + // With cached architecture and dedicated ZMQ sender thread, + // data is already being sent every interval + // This command is acknowledged but not actionable in new architecture } AgentCommand::SetInterval { seconds } => { info!("Received interval change request: {}s", seconds); diff --git a/agent/src/communication/mod.rs b/agent/src/communication/mod.rs index c364f7c..591f09f 100644 --- 
a/agent/src/communication/mod.rs +++ b/agent/src/communication/mod.rs @@ -1,13 +1,12 @@ use anyhow::Result; -use cm_dashboard_shared::{AgentData, MessageEnvelope}; use tracing::{debug, info}; use zmq::{Context, Socket, SocketType}; use crate::config::ZmqConfig; -/// ZMQ communication handler for publishing metrics and receiving commands +/// ZMQ communication handler for receiving commands +/// NOTE: Publishing is handled by dedicated thread in Agent::run() pub struct ZmqHandler { - publisher: Socket, command_receiver: Socket, } @@ -15,17 +14,6 @@ impl ZmqHandler { pub async fn new(config: &ZmqConfig) -> Result { let context = Context::new(); - // Create publisher socket for metrics - let publisher = context.socket(SocketType::PUB)?; - let pub_bind_address = format!("tcp://{}:{}", config.bind_address, config.publisher_port); - publisher.bind(&pub_bind_address)?; - - info!("ZMQ publisher bound to {}", pub_bind_address); - - // Set socket options for efficiency - publisher.set_sndhwm(1000)?; // High water mark for outbound messages - publisher.set_linger(1000)?; // Linger time on close - // Create command receiver socket (PULL socket to receive commands from dashboard) let command_receiver = context.socket(SocketType::PULL)?; let cmd_bind_address = format!("tcp://{}:{}", config.bind_address, config.command_port); @@ -38,33 +26,10 @@ impl ZmqHandler { command_receiver.set_linger(1000)?; Ok(Self { - publisher, command_receiver, }) } - - /// Publish agent data via ZMQ - pub async fn publish_agent_data(&self, data: &AgentData) -> Result<()> { - debug!( - "Publishing agent data for host {}", - data.hostname - ); - - // Create message envelope for agent data - let envelope = MessageEnvelope::agent_data(data.clone()) - .map_err(|e| anyhow::anyhow!("Failed to create agent data envelope: {}", e))?; - - // Serialize envelope - let serialized = serde_json::to_vec(&envelope)?; - - // Send via ZMQ - self.publisher.send(&serialized, 0)?; - - debug!("Published agent data message 
({} bytes)", serialized.len()); - Ok(()) - } - /// Try to receive a command (non-blocking) pub fn try_receive_command(&self) -> Result> { match self.command_receiver.recv_bytes(zmq::DONTWAIT) { diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index 7dfa429..a7da6d6 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.194" +version = "0.1.195" edition = "2021" [dependencies] diff --git a/shared/Cargo.toml b/shared/Cargo.toml index 26314cd..9dd1a6d 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.194" +version = "0.1.195" edition = "2021" [dependencies] From 43dd5a901a69fab5bab4662b33ce38cc9d5c74b9 Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Thu, 27 Nov 2025 22:59:38 +0100 Subject: [PATCH 06/14] Update CLAUDE.md with correct ZMQ sender architecture --- CLAUDE.md | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 32fe7aa..b50be71 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -207,10 +207,16 @@ Every 1 second: - Recommended: Slow (60-300s): Disk, Systemd - **Independent tasks**: Each collector spawned as separate tokio task in `Agent::new()` - **Cache updates**: Collectors acquire write lock → update → release immediately -- **ZMQ sender**: Main loop reads cache every `collection_interval_seconds` and broadcasts -- **Notification check**: Runs every `notifications.check_interval_seconds` -- **Lock strategy**: Short-lived write locks prevent blocking, read locks for transmission -- **Stale data**: Acceptable for slow-changing metrics (SMART data, disk usage) +- **ZMQ sender**: Dedicated OS thread with own publisher socket, uses `try_read()` to avoid blocking +- **Non-blocking reads**: `try_read()` never blocks - sends previous data if cache is locked +- **Notification check**: Runs every `notifications.check_interval_seconds` in main loop 
+- **Lock strategy**: Short-lived write locks, non-blocking reads for transmission +- **Stale data**: Acceptable for slow-changing metrics AND when collector holds write lock + +**Threading Model:** +- Main thread: tokio runtime for command handling and notifications +- Collector threads: 7 independent tokio tasks updating shared cache +- ZMQ sender thread: Dedicated OS thread (ZMQ sockets not thread-safe) with lock-free reads **Configuration (NixOS):** All intervals and timeouts configurable in `services/cm-dashboard.nix`: @@ -232,9 +238,10 @@ Command Timeouts (prevent resource leaks from hung commands): - `collectors.network.command_timeout_seconds` (default: 10s) - ip route, ip addr **Code Locations:** -- agent/src/agent.rs:59-133 - Collector task spawning -- agent/src/agent.rs:151-179 - Independent collector task runner -- agent/src/agent.rs:199-207 - ZMQ sender in main loop +- agent/src/agent.rs:69-144 - Collector task spawning with configured intervals +- agent/src/agent.rs:162-190 - Independent collector task runner +- agent/src/agent.rs:202-249 - ZMQ sender in dedicated OS thread with try_read() +- agent/src/agent.rs:251-264 - Main loop (commands and notifications only) ### Maintenance Mode From f09ccabc7fef01c71bf734c898ad6ec6538352c8 Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Thu, 27 Nov 2025 23:09:40 +0100 Subject: [PATCH 07/14] Revert "Fix data duplication in cached collector architecture" This reverts commit 14618c59c61b2f8d731697f01b8388ace825a809. 
--- CLAUDE.md | 21 +++------ Cargo.lock | 6 +-- agent/Cargo.toml | 2 +- agent/src/agent.rs | 79 +++++++++------------------------ agent/src/collectors/disk.rs | 6 --- agent/src/collectors/memory.rs | 5 +-- agent/src/collectors/systemd.rs | 3 -- agent/src/communication/mod.rs | 39 +++++++++++++++- dashboard/Cargo.toml | 2 +- shared/Cargo.toml | 2 +- 10 files changed, 71 insertions(+), 94 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index b50be71..32fe7aa 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -207,16 +207,10 @@ Every 1 second: - Recommended: Slow (60-300s): Disk, Systemd - **Independent tasks**: Each collector spawned as separate tokio task in `Agent::new()` - **Cache updates**: Collectors acquire write lock → update → release immediately -- **ZMQ sender**: Dedicated OS thread with own publisher socket, uses `try_read()` to avoid blocking -- **Non-blocking reads**: `try_read()` never blocks - sends previous data if cache is locked -- **Notification check**: Runs every `notifications.check_interval_seconds` in main loop -- **Lock strategy**: Short-lived write locks, non-blocking reads for transmission -- **Stale data**: Acceptable for slow-changing metrics AND when collector holds write lock - -**Threading Model:** -- Main thread: tokio runtime for command handling and notifications -- Collector threads: 7 independent tokio tasks updating shared cache -- ZMQ sender thread: Dedicated OS thread (ZMQ sockets not thread-safe) with lock-free reads +- **ZMQ sender**: Main loop reads cache every `collection_interval_seconds` and broadcasts +- **Notification check**: Runs every `notifications.check_interval_seconds` +- **Lock strategy**: Short-lived write locks prevent blocking, read locks for transmission +- **Stale data**: Acceptable for slow-changing metrics (SMART data, disk usage) **Configuration (NixOS):** All intervals and timeouts configurable in `services/cm-dashboard.nix`: @@ -238,10 +232,9 @@ Command Timeouts (prevent resource leaks from hung commands): - 
`collectors.network.command_timeout_seconds` (default: 10s) - ip route, ip addr **Code Locations:** -- agent/src/agent.rs:69-144 - Collector task spawning with configured intervals -- agent/src/agent.rs:162-190 - Independent collector task runner -- agent/src/agent.rs:202-249 - ZMQ sender in dedicated OS thread with try_read() -- agent/src/agent.rs:251-264 - Main loop (commands and notifications only) +- agent/src/agent.rs:59-133 - Collector task spawning +- agent/src/agent.rs:151-179 - Independent collector task runner +- agent/src/agent.rs:199-207 - ZMQ sender in main loop ### Maintenance Mode diff --git a/Cargo.lock b/Cargo.lock index 0c683ca..2a78eb9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -279,7 +279,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.194" +version = "0.1.192" dependencies = [ "anyhow", "chrono", @@ -301,7 +301,7 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.194" +version = "0.1.192" dependencies = [ "anyhow", "async-trait", @@ -324,7 +324,7 @@ dependencies = [ [[package]] name = "cm-dashboard-shared" -version = "0.1.194" +version = "0.1.192" dependencies = [ "chrono", "serde", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 86a4a38..dcdf453 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.195" +version = "0.1.193" edition = "2021" [dependencies] diff --git a/agent/src/agent.rs b/agent/src/agent.rs index 3e1d5dd..900c7a2 100644 --- a/agent/src/agent.rs +++ b/agent/src/agent.rs @@ -193,73 +193,31 @@ impl Agent { pub async fn run(&mut self, mut shutdown_rx: tokio::sync::oneshot::Receiver<()>) -> Result<()> { info!("Starting agent main loop with cached collector architecture"); - // Spawn independent ZMQ sender task - // Create dedicated ZMQ publisher for the sender task - let cache_clone = self.cache.clone(); - let publisher_config = 
self.config.zmq.clone(); - let transmission_interval_secs = self.config.collection_interval_seconds; - - std::thread::spawn(move || { - // Create ZMQ publisher in this thread (ZMQ sockets are not thread-safe) - let context = zmq::Context::new(); - let publisher = context.socket(zmq::SocketType::PUB).unwrap(); - let bind_address = format!("tcp://{}:{}", publisher_config.bind_address, publisher_config.publisher_port); - publisher.bind(&bind_address).unwrap(); - publisher.set_sndhwm(1000).unwrap(); - publisher.set_linger(1000).unwrap(); - info!("ZMQ sender task started on {} (interval: {}s)", bind_address, transmission_interval_secs); - - let mut last_sent_data: Option = None; - let interval_duration = std::time::Duration::from_secs(transmission_interval_secs); - let mut next_send = std::time::Instant::now() + interval_duration; - - loop { - // Sleep until next send time - std::thread::sleep(next_send.saturating_duration_since(std::time::Instant::now())); - next_send = std::time::Instant::now() + interval_duration; - - // Try to read cache without blocking - if locked, send last known data - let data_to_send = match cache_clone.try_read() { - Ok(agent_data) => { - let data_clone = agent_data.clone(); - drop(agent_data); // Release lock immediately - last_sent_data = Some(data_clone.clone()); - Some(data_clone) - } - Err(_) => { - // Lock is held by collector - use last sent data - debug!("Cache locked by collector, sending previous data"); - last_sent_data.clone() - } - }; - - if let Some(data) = data_to_send { - // Publish via ZMQ - if let Ok(envelope) = cm_dashboard_shared::MessageEnvelope::agent_data(data) { - if let Ok(serialized) = serde_json::to_vec(&envelope) { - if let Err(e) = publisher.send(&serialized, 0) { - error!("Failed to send ZMQ message: {}", e); - } else { - debug!("Successfully broadcast agent data"); - } - } - } - } - } - }); - - // Set up intervals for notifications and commands + // Set up intervals from config + let mut transmission_interval = 
interval(Duration::from_secs( + self.config.collection_interval_seconds, + )); let mut notification_interval = interval(Duration::from_secs( self.config.notifications.check_interval_seconds, )); let mut command_interval = interval(Duration::from_millis(100)); // Skip initial ticks + transmission_interval.tick().await; notification_interval.tick().await; command_interval.tick().await; loop { tokio::select! { + _ = transmission_interval.tick() => { + // Read current cache state and broadcast via ZMQ + let agent_data = self.cache.read().await.clone(); + if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data).await { + error!("Failed to broadcast agent data: {}", e); + } else { + debug!("Successfully broadcast agent data"); + } + } _ = notification_interval.tick() => { // Read cache and check for status changes let agent_data = self.cache.read().await.clone(); @@ -371,9 +329,12 @@ impl Agent { match command { AgentCommand::CollectNow => { info!("Received immediate transmission request"); - // With cached architecture and dedicated ZMQ sender thread, - // data is already being sent every interval - // This command is acknowledged but not actionable in new architecture + // With cached architecture, collectors run independently + // Just send current cache state immediately + let agent_data = self.cache.read().await.clone(); + if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data).await { + error!("Failed to broadcast on demand: {}", e); + } } AgentCommand::SetInterval { seconds } => { info!("Received interval change request: {}s", seconds); diff --git a/agent/src/collectors/disk.rs b/agent/src/collectors/disk.rs index dfbd5fa..588bec8 100644 --- a/agent/src/collectors/disk.rs +++ b/agent/src/collectors/disk.rs @@ -530,9 +530,6 @@ impl DiskCollector { /// Populate drives data into AgentData fn populate_drives_data(&self, physical_drives: &[PhysicalDrive], smart_data: &HashMap, agent_data: &mut AgentData) -> Result<(), CollectorError> { - // Clear 
existing drives data to prevent duplicates in cached architecture - agent_data.system.storage.drives.clear(); - for drive in physical_drives { let smart = smart_data.get(&drive.name); @@ -570,9 +567,6 @@ impl DiskCollector { /// Populate pools data into AgentData fn populate_pools_data(&self, mergerfs_pools: &[MergerfsPool], smart_data: &HashMap, agent_data: &mut AgentData) -> Result<(), CollectorError> { - // Clear existing pools data to prevent duplicates in cached architecture - agent_data.system.storage.pools.clear(); - for pool in mergerfs_pools { // Calculate pool health and statuses based on member drive health let (pool_health, health_status, usage_status, data_drive_data, parity_drive_data) = self.calculate_pool_health(pool, smart_data); diff --git a/agent/src/collectors/memory.rs b/agent/src/collectors/memory.rs index 9151ee2..e186704 100644 --- a/agent/src/collectors/memory.rs +++ b/agent/src/collectors/memory.rs @@ -97,12 +97,9 @@ impl MemoryCollector { /// Populate tmpfs data into AgentData async fn populate_tmpfs_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> { - // Clear existing tmpfs data to prevent duplicates in cached architecture - agent_data.system.memory.tmpfs.clear(); - // Discover all tmpfs mount points let tmpfs_mounts = self.discover_tmpfs_mounts()?; - + if tmpfs_mounts.is_empty() { debug!("No tmpfs mounts found to monitor"); return Ok(()); diff --git a/agent/src/collectors/systemd.rs b/agent/src/collectors/systemd.rs index 4e17cf4..9ba8663 100644 --- a/agent/src/collectors/systemd.rs +++ b/agent/src/collectors/systemd.rs @@ -915,9 +915,6 @@ impl SystemdCollector { #[async_trait] impl Collector for SystemdCollector { async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> { - // Clear existing services data to prevent duplicates in cached architecture - agent_data.services.clear(); - // Use cached complete data if available and fresh if let Some(cached_complete_services) = 
self.get_cached_complete_services() { for service_data in cached_complete_services { diff --git a/agent/src/communication/mod.rs b/agent/src/communication/mod.rs index 591f09f..c364f7c 100644 --- a/agent/src/communication/mod.rs +++ b/agent/src/communication/mod.rs @@ -1,12 +1,13 @@ use anyhow::Result; +use cm_dashboard_shared::{AgentData, MessageEnvelope}; use tracing::{debug, info}; use zmq::{Context, Socket, SocketType}; use crate::config::ZmqConfig; -/// ZMQ communication handler for receiving commands -/// NOTE: Publishing is handled by dedicated thread in Agent::run() +/// ZMQ communication handler for publishing metrics and receiving commands pub struct ZmqHandler { + publisher: Socket, command_receiver: Socket, } @@ -14,6 +15,17 @@ impl ZmqHandler { pub async fn new(config: &ZmqConfig) -> Result { let context = Context::new(); + // Create publisher socket for metrics + let publisher = context.socket(SocketType::PUB)?; + let pub_bind_address = format!("tcp://{}:{}", config.bind_address, config.publisher_port); + publisher.bind(&pub_bind_address)?; + + info!("ZMQ publisher bound to {}", pub_bind_address); + + // Set socket options for efficiency + publisher.set_sndhwm(1000)?; // High water mark for outbound messages + publisher.set_linger(1000)?; // Linger time on close + // Create command receiver socket (PULL socket to receive commands from dashboard) let command_receiver = context.socket(SocketType::PULL)?; let cmd_bind_address = format!("tcp://{}:{}", config.bind_address, config.command_port); @@ -26,10 +38,33 @@ impl ZmqHandler { command_receiver.set_linger(1000)?; Ok(Self { + publisher, command_receiver, }) } + + /// Publish agent data via ZMQ + pub async fn publish_agent_data(&self, data: &AgentData) -> Result<()> { + debug!( + "Publishing agent data for host {}", + data.hostname + ); + + // Create message envelope for agent data + let envelope = MessageEnvelope::agent_data(data.clone()) + .map_err(|e| anyhow::anyhow!("Failed to create agent data 
envelope: {}", e))?; + + // Serialize envelope + let serialized = serde_json::to_vec(&envelope)?; + + // Send via ZMQ + self.publisher.send(&serialized, 0)?; + + debug!("Published agent data message ({} bytes)", serialized.len()); + Ok(()) + } + /// Try to receive a command (non-blocking) pub fn try_receive_command(&self) -> Result> { match self.command_receiver.recv_bytes(zmq::DONTWAIT) { diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index a7da6d6..ce8d404 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.195" +version = "0.1.193" edition = "2021" [dependencies] diff --git a/shared/Cargo.toml b/shared/Cargo.toml index 9dd1a6d..3949e8c 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.195" +version = "0.1.193" edition = "2021" [dependencies] From 3c2955376d1c813aced8e0f0c0aa6b52b208f71c Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Thu, 27 Nov 2025 23:10:55 +0100 Subject: [PATCH 08/14] Revert "Fix ZMQ sender blocking - move to independent thread with try_read" This reverts commit 01e1f33b66f3f33a448b95be73aabcfdd3a8c6d0. --- Cargo.lock | 51 ++++++++++++++++++++++---------------------- agent/Cargo.toml | 2 +- dashboard/Cargo.toml | 2 +- shared/Cargo.toml | 2 +- 4 files changed, 28 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2a78eb9..6dfaddb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,5 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -# This file is automatically generated by Cargo. 
version = 4 [[package]] @@ -165,9 +164,9 @@ checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53" [[package]] name = "cc" -version = "1.2.46" +version = "1.2.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97463e1064cb1b1c1384ad0a0b9c8abd0988e2a91f52606c80ef14aadb63e36" +checksum = "cd405d82c84ff7f35739f175f67d8b9fb7687a0e84ccdc78bd3568839827cf07" dependencies = [ "find-msvc-tools", "jobserver", @@ -239,9 +238,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.52" +version = "4.5.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa8120877db0e5c011242f96806ce3c94e0737ab8108532a76a3300a01db2ab8" +checksum = "c9e340e012a1bf4935f5282ed1436d1489548e8f72308207ea5df0e23d2d03f8" dependencies = [ "clap_builder", "clap_derive", @@ -249,9 +248,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.52" +version = "4.5.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02576b399397b659c26064fbc92a75fede9d18ffd5f80ca1cd74ddab167016e1" +checksum = "d76b5d13eaa18c901fd2f7fca939fefe3a0727a953561fefdf3b2922b8569d00" dependencies = [ "anstream", "anstyle", @@ -664,9 +663,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.16.0" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" [[package]] name = "heck" @@ -879,12 +878,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.12.0" +version = "2.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f" +checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2" dependencies = [ "equivalent", - "hashbrown 0.16.0", + "hashbrown 0.16.1", ] [[package]] @@ -1639,9 
+1638,9 @@ dependencies = [ [[package]] name = "signal-hook-registry" -version = "1.4.6" +version = "1.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" +checksum = "7664a098b8e616bdfcc2dc0e9ac44eb231eedf41db4e9fe95d8d32ec728dedad" dependencies = [ "libc", ] @@ -1733,9 +1732,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.110" +version = "2.0.111" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea" +checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87" dependencies = [ "proc-macro2", "quote", @@ -1962,9 +1961,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", @@ -1973,9 +1972,9 @@ dependencies = [ [[package]] name = "tracing-core" -version = "0.1.34" +version = "0.1.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" +checksum = "7a04e24fab5c89c6a36eb8558c9656f30d81de51dfa4d3b45f26b21d61fa0a6c" dependencies = [ "once_cell", "valuable", @@ -2513,9 +2512,9 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" -version = "0.7.13" +version = "0.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" +checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" dependencies = [ "memchr", ] @@ -2567,18 +2566,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.27" 
+version = "0.8.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c" +checksum = "4ea879c944afe8a2b25fef16bb4ba234f47c694565e97383b36f3a878219065c" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.27" +version = "0.8.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831" +checksum = "cf955aa904d6040f70dc8e9384444cb1030aed272ba3cb09bbc4ab9e7c1f34f5" dependencies = [ "proc-macro2", "quote", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index dcdf453..c4f70ac 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.193" +version = "0.1.192" edition = "2021" [dependencies] diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index ce8d404..5f612ec 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.193" +version = "0.1.192" edition = "2021" [dependencies] diff --git a/shared/Cargo.toml b/shared/Cargo.toml index 3949e8c..e0c9e6a 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.193" +version = "0.1.192" edition = "2021" [dependencies] From 0db1a165b90b6e5480e6018e91f962b2adf4085a Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Thu, 27 Nov 2025 23:12:08 +0100 Subject: [PATCH 09/14] Revert "Implement cached collector architecture with configurable timeouts" This reverts commit 2740de9b54e5239acfe9d788f9e6a877f7274331. 
--- CLAUDE.md | 44 ++----- Cargo.lock | 6 +- agent/src/agent.rs | 216 +++++++++++--------------------- agent/src/collectors/disk.rs | 2 +- agent/src/collectors/network.rs | 14 +-- agent/src/collectors/systemd.rs | 39 +++--- agent/src/config/mod.rs | 28 ----- 7 files changed, 110 insertions(+), 239 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 32fe7aa..3786ee8 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -156,7 +156,7 @@ Complete migration from string-based metrics to structured JSON data. Eliminates - ✅ Backward compatibility via bridge conversion to existing UI widgets - ✅ All string parsing bugs eliminated -### Cached Collector Architecture (✅ IMPLEMENTED) +### Cached Collector Architecture (🚧 PLANNED) **Problem:** Blocking collectors prevent timely ZMQ transmission, causing false "host offline" alerts. @@ -199,42 +199,12 @@ Every 1 second: - ✅ System stays responsive even with slow operations - ✅ Slow collectors can use longer timeouts without blocking -**Implementation Details:** -- **Shared cache**: `Arc<RwLock<AgentData>>` initialized at agent startup -- **Collector intervals**: Fully configurable via NixOS config (`interval_seconds` per collector) - - Recommended: Fast (1-10s): CPU, Memory, Network - - Recommended: Medium (30-60s): Backup, NixOS - - Recommended: Slow (60-300s): Disk, Systemd -- **Independent tasks**: Each collector spawned as separate tokio task in `Agent::new()` -- **Cache updates**: Collectors acquire write lock → update → release immediately -- **ZMQ sender**: Main loop reads cache every `collection_interval_seconds` and broadcasts -- **Notification check**: Runs every `notifications.check_interval_seconds` -- **Lock strategy**: Short-lived write locks prevent blocking, read locks for transmission -- **Stale data**: Acceptable for slow-changing metrics (SMART data, disk usage) - -**Configuration (NixOS):** -All intervals and timeouts configurable in `services/cm-dashboard.nix`: - -Collection Intervals: -- `collectors.cpu.interval_seconds` (default: 
10s) -- `collectors.memory.interval_seconds` (default: 2s) -- `collectors.disk.interval_seconds` (default: 300s) -- `collectors.systemd.interval_seconds` (default: 10s) -- `collectors.backup.interval_seconds` (default: 60s) -- `collectors.network.interval_seconds` (default: 10s) -- `collectors.nixos.interval_seconds` (default: 60s) -- `notifications.check_interval_seconds` (default: 30s) -- `collection_interval_seconds` - ZMQ transmission rate (default: 2s) - -Command Timeouts (prevent resource leaks from hung commands): -- `collectors.disk.command_timeout_seconds` (default: 30s) - lsblk, smartctl, etc. -- `collectors.systemd.command_timeout_seconds` (default: 15s) - systemctl, docker, du -- `collectors.network.command_timeout_seconds` (default: 10s) - ip route, ip addr - -**Code Locations:** -- agent/src/agent.rs:59-133 - Collector task spawning -- agent/src/agent.rs:151-179 - Independent collector task runner -- agent/src/agent.rs:199-207 - ZMQ sender in main loop +**Implementation:** +- Shared `AgentData` cache wrapped in `Arc>` +- Each collector spawned as independent tokio task +- Collectors update their section of cache at their own rate +- ZMQ sender reads cache every 1s and transmits +- Stale data acceptable for slow-changing metrics (disk usage, SMART) ### Maintenance Mode diff --git a/Cargo.lock b/Cargo.lock index 6dfaddb..343050f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -278,7 +278,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.192" +version = "0.1.191" dependencies = [ "anyhow", "chrono", @@ -300,7 +300,7 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.192" +version = "0.1.191" dependencies = [ "anyhow", "async-trait", @@ -323,7 +323,7 @@ dependencies = [ [[package]] name = "cm-dashboard-shared" -version = "0.1.192" +version = "0.1.191" dependencies = [ "chrono", "serde", diff --git a/agent/src/agent.rs b/agent/src/agent.rs index 
900c7a2..d74a4c7 100644 --- a/agent/src/agent.rs +++ b/agent/src/agent.rs @@ -1,14 +1,13 @@ use anyhow::Result; use gethostname::gethostname; -use std::sync::Arc; use std::time::Duration; -use tokio::sync::RwLock; use tokio::time::interval; use tracing::{debug, error, info}; use crate::communication::{AgentCommand, ZmqHandler}; use crate::config::AgentConfig; use crate::collectors::{ + Collector, backup::BackupCollector, cpu::CpuCollector, disk::DiskCollector, @@ -24,7 +23,7 @@ pub struct Agent { hostname: String, config: AgentConfig, zmq_handler: ZmqHandler, - cache: Arc>, + collectors: Vec>, notification_manager: NotificationManager, previous_status: Option, } @@ -56,94 +55,39 @@ impl Agent { config.zmq.publisher_port ); - // Initialize shared cache - let cache = Arc::new(RwLock::new(AgentData::new( - hostname.clone(), - env!("CARGO_PKG_VERSION").to_string() - ))); - info!("Initialized shared agent data cache"); - - // Spawn independent collector tasks - let mut collector_count = 0; - - // CPU collector + // Initialize collectors + let mut collectors: Vec> = Vec::new(); + + // Add enabled collectors if config.collectors.cpu.enabled { - let cache_clone = cache.clone(); - let collector = CpuCollector::new(config.collectors.cpu.clone()); - let interval = config.collectors.cpu.interval_seconds; - tokio::spawn(async move { - Self::run_collector_task(cache_clone, collector, Duration::from_secs(interval), "CPU").await; - }); - collector_count += 1; + collectors.push(Box::new(CpuCollector::new(config.collectors.cpu.clone()))); } - - // Memory collector + if config.collectors.memory.enabled { - let cache_clone = cache.clone(); - let collector = MemoryCollector::new(config.collectors.memory.clone()); - let interval = config.collectors.memory.interval_seconds; - tokio::spawn(async move { - Self::run_collector_task(cache_clone, collector, Duration::from_secs(interval), "Memory").await; - }); - collector_count += 1; + 
collectors.push(Box::new(MemoryCollector::new(config.collectors.memory.clone()))); } - - // Network collector - if config.collectors.network.enabled { - let cache_clone = cache.clone(); - let collector = NetworkCollector::new(config.collectors.network.clone()); - let interval = config.collectors.network.interval_seconds; - tokio::spawn(async move { - Self::run_collector_task(cache_clone, collector, Duration::from_secs(interval), "Network").await; - }); - collector_count += 1; - } - - // Backup collector - if config.collectors.backup.enabled { - let cache_clone = cache.clone(); - let collector = BackupCollector::new(); - let interval = config.collectors.backup.interval_seconds; - tokio::spawn(async move { - Self::run_collector_task(cache_clone, collector, Duration::from_secs(interval), "Backup").await; - }); - collector_count += 1; - } - - // NixOS collector - if config.collectors.nixos.enabled { - let cache_clone = cache.clone(); - let collector = NixOSCollector::new(config.collectors.nixos.clone()); - let interval = config.collectors.nixos.interval_seconds; - tokio::spawn(async move { - Self::run_collector_task(cache_clone, collector, Duration::from_secs(interval), "NixOS").await; - }); - collector_count += 1; - } - - // Disk collector + if config.collectors.disk.enabled { - let cache_clone = cache.clone(); - let collector = DiskCollector::new(config.collectors.disk.clone()); - let interval = config.collectors.disk.interval_seconds; - tokio::spawn(async move { - Self::run_collector_task(cache_clone, collector, Duration::from_secs(interval), "Disk").await; - }); - collector_count += 1; + collectors.push(Box::new(DiskCollector::new(config.collectors.disk.clone()))); } - - // Systemd collector + if config.collectors.systemd.enabled { - let cache_clone = cache.clone(); - let collector = SystemdCollector::new(config.collectors.systemd.clone()); - let interval = config.collectors.systemd.interval_seconds; - tokio::spawn(async move { - 
Self::run_collector_task(cache_clone, collector, Duration::from_secs(interval), "Systemd").await; - }); - collector_count += 1; + collectors.push(Box::new(SystemdCollector::new(config.collectors.systemd.clone()))); + } + + if config.collectors.backup.enabled { + collectors.push(Box::new(BackupCollector::new())); } - info!("Spawned {} independent collector tasks", collector_count); + if config.collectors.network.enabled { + collectors.push(Box::new(NetworkCollector::new(config.collectors.network.clone()))); + } + + if config.collectors.nixos.enabled { + collectors.push(Box::new(NixOSCollector::new(config.collectors.nixos.clone()))); + } + + info!("Initialized {} collectors", collectors.len()); // Initialize notification manager let notification_manager = NotificationManager::new(&config.notifications, &hostname)?; @@ -153,79 +97,45 @@ impl Agent { hostname, config, zmq_handler, - cache, + collectors, notification_manager, previous_status: None, }) } - /// Independent collector task runner - async fn run_collector_task( - cache: Arc>, - collector: C, - interval_duration: Duration, - name: &str, - ) where - C: crate::collectors::Collector + Send + 'static, - { - let mut interval_timer = interval(interval_duration); - info!("{} collector task started (interval: {:?})", name, interval_duration); - - loop { - interval_timer.tick().await; - - // Acquire write lock and update cache - { - let mut agent_data = cache.write().await; - match collector.collect_structured(&mut *agent_data).await { - Ok(_) => { - debug!("{} collector updated cache", name); - } - Err(e) => { - error!("{} collector failed: {}", name, e); - } - } - } // Release lock immediately after collection - } - } - - /// Main agent loop with cached data architecture + /// Main agent loop with structured data collection pub async fn run(&mut self, mut shutdown_rx: tokio::sync::oneshot::Receiver<()>) -> Result<()> { - info!("Starting agent main loop with cached collector architecture"); + info!("Starting agent 
main loop"); - // Set up intervals from config + // Initial collection + if let Err(e) = self.collect_and_broadcast().await { + error!("Initial metric collection failed: {}", e); + } + + // Set up intervals let mut transmission_interval = interval(Duration::from_secs( self.config.collection_interval_seconds, )); - let mut notification_interval = interval(Duration::from_secs( - self.config.notifications.check_interval_seconds, - )); - let mut command_interval = interval(Duration::from_millis(100)); + let mut notification_interval = interval(Duration::from_secs(30)); // Check notifications every 30s - // Skip initial ticks + // Skip initial ticks to avoid immediate execution transmission_interval.tick().await; notification_interval.tick().await; - command_interval.tick().await; loop { tokio::select! { _ = transmission_interval.tick() => { - // Read current cache state and broadcast via ZMQ - let agent_data = self.cache.read().await.clone(); - if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data).await { - error!("Failed to broadcast agent data: {}", e); - } else { - debug!("Successfully broadcast agent data"); + if let Err(e) = self.collect_and_broadcast().await { + error!("Failed to collect and broadcast metrics: {}", e); } } _ = notification_interval.tick() => { - // Read cache and check for status changes - let agent_data = self.cache.read().await.clone(); - if let Err(e) = self.check_status_changes_and_notify(&agent_data).await { - error!("Failed to check status changes: {}", e); - } + // Process any pending notifications + // NOTE: With structured data, we might need to implement status tracking differently + // For now, we skip this until status evaluation is migrated } - _ = command_interval.tick() => { + // Handle incoming commands (check periodically) + _ = tokio::time::sleep(Duration::from_millis(100)) => { if let Err(e) = self.handle_commands().await { error!("Error handling commands: {}", e); } @@ -241,6 +151,35 @@ impl Agent { Ok(()) } + /// 
Collect structured data from all collectors and broadcast via ZMQ + async fn collect_and_broadcast(&mut self) -> Result<()> { + debug!("Starting structured data collection"); + + // Initialize empty AgentData + let mut agent_data = AgentData::new(self.hostname.clone(), env!("CARGO_PKG_VERSION").to_string()); + + // Collect data from all collectors + for collector in &self.collectors { + if let Err(e) = collector.collect_structured(&mut agent_data).await { + error!("Collector failed: {}", e); + // Continue with other collectors even if one fails + } + } + + // Check for status changes and send notifications + if let Err(e) = self.check_status_changes_and_notify(&agent_data).await { + error!("Failed to check status changes: {}", e); + } + + // Broadcast the structured data via ZMQ + if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data).await { + error!("Failed to broadcast agent data: {}", e); + } else { + debug!("Successfully broadcast structured agent data"); + } + + Ok(()) + } /// Check for status changes and send notifications async fn check_status_changes_and_notify(&mut self, agent_data: &AgentData) -> Result<()> { @@ -328,12 +267,9 @@ impl Agent { match command { AgentCommand::CollectNow => { - info!("Received immediate transmission request"); - // With cached architecture, collectors run independently - // Just send current cache state immediately - let agent_data = self.cache.read().await.clone(); - if let Err(e) = self.zmq_handler.publish_agent_data(&agent_data).await { - error!("Failed to broadcast on demand: {}", e); + info!("Received immediate collection request"); + if let Err(e) = self.collect_and_broadcast().await { + error!("Failed to collect on demand: {}", e); } } AgentCommand::SetInterval { seconds } => { diff --git a/agent/src/collectors/disk.rs b/agent/src/collectors/disk.rs index 588bec8..71c53cf 100644 --- a/agent/src/collectors/disk.rs +++ b/agent/src/collectors/disk.rs @@ -117,7 +117,7 @@ impl DiskCollector { let mut cmd = 
Command::new("lsblk"); cmd.args(&["-rn", "-o", "NAME,MOUNTPOINT"]); - let output = run_command_with_timeout(cmd, self.config.command_timeout_seconds).await + let output = run_command_with_timeout(cmd, 2).await .map_err(|e| CollectorError::SystemRead { path: "block devices".to_string(), error: e.to_string(), diff --git a/agent/src/collectors/network.rs b/agent/src/collectors/network.rs index 5a26b05..fd4dbe2 100644 --- a/agent/src/collectors/network.rs +++ b/agent/src/collectors/network.rs @@ -8,12 +8,12 @@ use crate::config::NetworkConfig; /// Network interface collector with physical/virtual classification and link status pub struct NetworkCollector { - config: NetworkConfig, + _config: NetworkConfig, } impl NetworkCollector { pub fn new(config: NetworkConfig) -> Self { - Self { config } + Self { _config: config } } /// Check if interface is physical (not virtual) @@ -50,9 +50,8 @@ impl NetworkCollector { } /// Get the primary physical interface (the one with default route) - fn get_primary_physical_interface(&self) -> Option { - let timeout_str = self.config.command_timeout_seconds.to_string(); - match Command::new("timeout").args([&timeout_str, "ip", "route", "show", "default"]).output() { + fn get_primary_physical_interface() -> Option { + match Command::new("timeout").args(["2", "ip", "route", "show", "default"]).output() { Ok(output) if output.status.success() => { let output_str = String::from_utf8_lossy(&output.stdout); // Parse: "default via 192.168.1.1 dev eno1 ..." 
@@ -111,8 +110,7 @@ impl NetworkCollector { // Parse VLAN configuration let vlan_map = Self::parse_vlan_config(); - let timeout_str = self.config.command_timeout_seconds.to_string(); - match Command::new("timeout").args([&timeout_str, "ip", "-j", "addr"]).output() { + match Command::new("timeout").args(["2", "ip", "-j", "addr"]).output() { Ok(output) if output.status.success() => { let json_str = String::from_utf8_lossy(&output.stdout); @@ -197,7 +195,7 @@ impl NetworkCollector { } // Assign primary physical interface as parent to virtual interfaces without explicit parent - let primary_interface = self.get_primary_physical_interface(); + let primary_interface = Self::get_primary_physical_interface(); if let Some(primary) = primary_interface { for interface in interfaces.iter_mut() { // Only assign parent to virtual interfaces that don't already have one diff --git a/agent/src/collectors/systemd.rs b/agent/src/collectors/systemd.rs index 9ba8663..bcaa6be 100644 --- a/agent/src/collectors/systemd.rs +++ b/agent/src/collectors/systemd.rs @@ -254,19 +254,18 @@ impl SystemdCollector { /// Auto-discover interesting services to monitor fn discover_services_internal(&self) -> Result<(Vec, std::collections::HashMap)> { - // First: Get all service unit files - let timeout_str = self.config.command_timeout_seconds.to_string(); + // First: Get all service unit files (with 3 second timeout) let unit_files_output = Command::new("timeout") - .args(&[&timeout_str, "systemctl", "list-unit-files", "--type=service", "--no-pager", "--plain"]) + .args(&["3", "systemctl", "list-unit-files", "--type=service", "--no-pager", "--plain"]) .output()?; if !unit_files_output.status.success() { return Err(anyhow::anyhow!("systemctl list-unit-files command failed")); } - // Second: Get runtime status of all units + // Second: Get runtime status of all units (with 3 second timeout) let units_status_output = Command::new("timeout") - .args(&[&timeout_str, "systemctl", "list-units", 
"--type=service", "--all", "--no-pager", "--plain"]) + .args(&["3", "systemctl", "list-units", "--type=service", "--all", "--no-pager", "--plain"]) .output()?; if !units_status_output.status.success() { @@ -362,17 +361,16 @@ impl SystemdCollector { } } - // Fallback to systemctl if not in cache - let timeout_str = self.config.command_timeout_seconds.to_string(); + // Fallback to systemctl if not in cache (with 2 second timeout) let output = Command::new("timeout") - .args(&[&timeout_str, "systemctl", "is-active", &format!("{}.service", service)]) + .args(&["2", "systemctl", "is-active", &format!("{}.service", service)]) .output()?; let active_status = String::from_utf8(output.stdout)?.trim().to_string(); - // Get more detailed info + // Get more detailed info (with 2 second timeout) let output = Command::new("timeout") - .args(&[&timeout_str, "systemctl", "show", &format!("{}.service", service), "--property=LoadState,ActiveState,SubState"]) + .args(&["2", "systemctl", "show", &format!("{}.service", service), "--property=LoadState,ActiveState,SubState"]) .output()?; let detailed_info = String::from_utf8(output.stdout)?; @@ -432,10 +430,9 @@ impl SystemdCollector { return Ok(0.0); } - // No configured path - try to get WorkingDirectory from systemctl - let timeout_str = self.config.command_timeout_seconds.to_string(); + // No configured path - try to get WorkingDirectory from systemctl (with 2 second timeout) let output = Command::new("timeout") - .args(&[&timeout_str, "systemctl", "show", &format!("{}.service", service_name), "--property=WorkingDirectory"]) + .args(&["2", "systemctl", "show", &format!("{}.service", service_name), "--property=WorkingDirectory"]) .output() .map_err(|e| CollectorError::SystemRead { path: format!("WorkingDirectory for {}", service_name), @@ -455,15 +452,15 @@ impl SystemdCollector { Ok(0.0) } - /// Get size of a directory in GB + /// Get size of a directory in GB (with 2 second timeout) async fn get_directory_size(&self, path: &str) -> 
Option { use super::run_command_with_timeout; - // Use -s (summary) and --apparent-size for speed + // Use -s (summary) and --apparent-size for speed, 2 second timeout let mut cmd = Command::new("sudo"); cmd.args(&["du", "-s", "--apparent-size", "--block-size=1", path]); - let output = run_command_with_timeout(cmd, self.config.command_timeout_seconds).await.ok()?; + let output = run_command_with_timeout(cmd, 2).await.ok()?; if !output.status.success() { // Log permission errors for debugging but don't spam logs @@ -789,10 +786,9 @@ impl SystemdCollector { let mut containers = Vec::new(); // Check if docker is available (cm-agent user is in docker group) - // Use -a to show ALL containers (running and stopped) - let timeout_str = self.config.command_timeout_seconds.to_string(); + // Use -a to show ALL containers (running and stopped) with 3 second timeout let output = Command::new("timeout") - .args(&[&timeout_str, "docker", "ps", "-a", "--format", "{{.Names}},{{.Status}}"]) + .args(&["3", "docker", "ps", "-a", "--format", "{{.Names}},{{.Status}}"]) .output(); let output = match output { @@ -833,10 +829,9 @@ impl SystemdCollector { /// Get docker images as sub-services fn get_docker_images(&self) -> Vec<(String, String, f32)> { let mut images = Vec::new(); - // Check if docker is available (cm-agent user is in docker group) - let timeout_str = self.config.command_timeout_seconds.to_string(); + // Check if docker is available (cm-agent user is in docker group) with 3 second timeout let output = Command::new("timeout") - .args(&[&timeout_str, "docker", "images", "--format", "{{.Repository}}:{{.Tag}},{{.Size}}"]) + .args(&["3", "docker", "images", "--format", "{{.Repository}}:{{.Tag}},{{.Size}}"]) .output(); let output = match output { diff --git a/agent/src/config/mod.rs b/agent/src/config/mod.rs index f0f403a..8593b54 100644 --- a/agent/src/config/mod.rs +++ b/agent/src/config/mod.rs @@ -79,9 +79,6 @@ pub struct DiskConfig { pub temperature_critical_celsius: f32, pub 
wear_warning_percent: f32, pub wear_critical_percent: f32, - /// Command timeout in seconds for lsblk, smartctl, etc. - #[serde(default = "default_disk_command_timeout")] - pub command_timeout_seconds: u64, } /// Filesystem configuration entry @@ -111,9 +108,6 @@ pub struct SystemdConfig { pub http_timeout_seconds: u64, pub http_connect_timeout_seconds: u64, pub nginx_latency_critical_ms: f32, - /// Command timeout in seconds for systemctl, docker, du commands - #[serde(default = "default_systemd_command_timeout")] - pub command_timeout_seconds: u64, } @@ -138,9 +132,6 @@ pub struct BackupConfig { pub struct NetworkConfig { pub enabled: bool, pub interval_seconds: u64, - /// Command timeout in seconds for ip route, ip addr commands - #[serde(default = "default_network_command_timeout")] - pub command_timeout_seconds: u64, } /// Notification configuration @@ -154,9 +145,6 @@ pub struct NotificationConfig { pub rate_limit_minutes: u64, /// Email notification batching interval in seconds (default: 60) pub aggregation_interval_seconds: u64, - /// Status check interval in seconds for detecting changes (default: 30) - #[serde(default = "default_notification_check_interval")] - pub check_interval_seconds: u64, /// List of metric names to exclude from email notifications #[serde(default)] pub exclude_email_metrics: Vec, @@ -170,26 +158,10 @@ fn default_heartbeat_interval_seconds() -> u64 { 5 } -fn default_notification_check_interval() -> u64 { - 30 -} - fn default_maintenance_mode_file() -> String { "/tmp/cm-maintenance".to_string() } -fn default_disk_command_timeout() -> u64 { - 30 -} - -fn default_systemd_command_timeout() -> u64 { - 15 -} - -fn default_network_command_timeout() -> u64 { - 10 -} - impl AgentConfig { pub fn from_file>(path: P) -> Result { loader::load_config(path) From 317cf76bd1562dbebf16ba66447af78032fa7f6b Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Thu, 27 Nov 2025 23:16:40 +0100 Subject: [PATCH 10/14] Bump version to v0.1.196 --- 
Cargo.lock | 6 +++--- agent/Cargo.toml | 2 +- dashboard/Cargo.toml | 2 +- shared/Cargo.toml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 343050f..b2c955b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -278,7 +278,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.191" +version = "0.1.196" dependencies = [ "anyhow", "chrono", @@ -300,7 +300,7 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.191" +version = "0.1.196" dependencies = [ "anyhow", "async-trait", @@ -323,7 +323,7 @@ dependencies = [ [[package]] name = "cm-dashboard-shared" -version = "0.1.191" +version = "0.1.196" dependencies = [ "chrono", "serde", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index c4f70ac..6691dad 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.192" +version = "0.1.196" edition = "2021" [dependencies] diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index 5f612ec..da9e534 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.192" +version = "0.1.196" edition = "2021" [dependencies] diff --git a/shared/Cargo.toml b/shared/Cargo.toml index e0c9e6a..af42308 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.192" +version = "0.1.196" edition = "2021" [dependencies] From b444c88ea081f04cb59342e879149e66077c528e Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Fri, 28 Nov 2025 11:27:33 +0100 Subject: [PATCH 11/14] Replace external commands with native Rust APIs Significant performance improvements by eliminating subprocess spawning: - Replace 'ip' commands with rtnetlink for network interface discovery - Replace 'docker ps/images' with bollard Docker API client - Replace 'systemctl list-units' 
with zbus D-Bus for systemd interaction - Replace 'df' with statvfs() syscall for filesystem statistics - Replace 'lsblk' with /proc/mounts parsing Add interval-based caching to collectors: - DiskCollector now respects interval_seconds configuration - SystemdCollector now respects interval_seconds configuration - CpuCollector now respects interval_seconds configuration Remove unused command communication infrastructure: - Remove port 6131 ZMQ command receiver - Clean up unused AgentCommand types Dependencies added: - rtnetlink = "0.14" - netlink-packet-route = "0.19" - bollard = "0.17" - zbus = "4.0" - nix (fs features for statvfs) --- Cargo.lock | 1078 ++++++++++++++++++++++++++++++- agent/Cargo.toml | 17 +- agent/src/agent.rs | 40 +- agent/src/collectors/cpu.rs | 62 +- agent/src/collectors/disk.rs | 172 +++-- agent/src/collectors/memory.rs | 73 +-- agent/src/collectors/network.rs | 222 ++++--- agent/src/collectors/systemd.rs | 260 +++----- agent/src/communication/mod.rs | 47 +- agent/src/config/mod.rs | 1 - agent/src/config/validation.rs | 8 - dashboard/Cargo.toml | 2 +- shared/Cargo.toml | 2 +- 13 files changed, 1483 insertions(+), 501 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b2c955b..716aaae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -103,6 +103,137 @@ dependencies = [ "object", ] +[[package]] +name = "async-broadcast" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "435a87a52755b8f27fcf321ac4f04b2802e337c8c4872923137471ec39c37532" +dependencies = [ + "event-listener", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-channel" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "924ed96dd52d1b75e9c1a3e6275715fd320f5f9439fb5a4a11fa51f4221158d2" +dependencies = [ + "concurrent-queue", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-executor" +version 
= "1.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "497c00e0fd83a72a79a39fcbd8e3e2f055d6f6c7e025f3b3d91f4f8e76527fb8" +dependencies = [ + "async-task", + "concurrent-queue", + "fastrand", + "futures-lite", + "pin-project-lite", + "slab", +] + +[[package]] +name = "async-fs" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8034a681df4aed8b8edbd7fbe472401ecf009251c8b40556b304567052e294c5" +dependencies = [ + "async-lock", + "blocking", + "futures-lite", +] + +[[package]] +name = "async-io" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "456b8a8feb6f42d237746d4b3e9a178494627745c3c56c6ea55d92ba50d026fc" +dependencies = [ + "autocfg", + "cfg-if", + "concurrent-queue", + "futures-io", + "futures-lite", + "parking", + "polling", + "rustix", + "slab", + "windows-sys 0.61.2", +] + +[[package]] +name = "async-lock" +version = "3.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd03604047cee9b6ce9de9f70c6cd540a0520c813cbd49bae61f33ab80ed1dc" +dependencies = [ + "event-listener", + "event-listener-strategy", + "pin-project-lite", +] + +[[package]] +name = "async-process" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc50921ec0055cdd8a16de48773bfeec5c972598674347252c0399676be7da75" +dependencies = [ + "async-channel", + "async-io", + "async-lock", + "async-signal", + "async-task", + "blocking", + "cfg-if", + "event-listener", + "futures-lite", + "rustix", +] + +[[package]] +name = "async-recursion" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "async-signal" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"43c070bbf59cd3570b6b2dd54cd772527c7c3620fce8be898406dd3ed6adc64c" +dependencies = [ + "async-io", + "async-lock", + "atomic-waker", + "cfg-if", + "futures-core", + "futures-io", + "rustix", + "signal-hook-registry", + "slab", + "windows-sys 0.61.2", +] + +[[package]] +name = "async-task" +version = "4.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de" + [[package]] name = "async-trait" version = "0.1.89" @@ -114,6 +245,12 @@ dependencies = [ "syn", ] +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" version = "1.5.0" @@ -144,12 +281,84 @@ version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "blocking" +version = "1.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e83f8d02be6967315521be875afa792a316e28d57b5a2d401897e2a7921b7f21" +dependencies = [ + "async-channel", + "async-task", + "futures-io", + "futures-lite", + "piper", +] + +[[package]] +name = "bollard" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d41711ad46fda47cd701f6908e59d1bd6b9a2b7464c0d0aeab95c6d37096ff8a" +dependencies = [ + "base64 0.22.1", + "bollard-stubs", + "bytes", + "futures-core", + "futures-util", + "hex", + "http 1.4.0", + "http-body-util", + "hyper 1.8.1", + "hyper-named-pipe", + "hyper-util", + "hyperlocal", + "log", + "pin-project-lite", + "serde", + 
"serde_derive", + "serde_json", + "serde_repr", + "serde_urlencoded", + "thiserror 1.0.69", + "tokio", + "tokio-util", + "tower-service", + "url", + "winapi", +] + +[[package]] +name = "bollard-stubs" +version = "1.45.0-rc.26.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d7c5415e3a6bc6d3e99eff6268e488fd4ee25e7b28c10f08fa6760bd9de16e4" +dependencies = [ + "serde", + "serde_repr", + "serde_with", +] + [[package]] name = "bumpalo" version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "bytes" version = "1.11.0" @@ -190,6 +399,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "chrono" version = "0.4.42" @@ -278,7 +493,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.196" +version = "0.1.197" dependencies = [ "anyhow", "chrono", @@ -289,7 +504,7 @@ dependencies = [ "ratatui", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "toml", "tracing", @@ -300,35 +515,42 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.196" +version = "0.1.197" dependencies = [ "anyhow", "async-trait", + "bollard", "chrono", "chrono-tz", "clap", "cm-dashboard-shared", + "futures", "gethostname", "lettre", + "libc", + "netlink-packet-route", + "nix 0.29.0", "reqwest", + 
"rtnetlink", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", "tokio", "toml", "tracing", "tracing-subscriber", + "zbus", "zmq", ] [[package]] name = "cm-dashboard-shared" -version = "0.1.196" +version = "0.1.197" dependencies = [ "chrono", "serde", "serde_json", - "thiserror", + "thiserror 1.0.69", ] [[package]] @@ -337,6 +559,15 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -353,6 +584,15 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + [[package]] name = "crossbeam" version = "0.8.4" @@ -434,6 +674,36 @@ dependencies = [ "winapi", ] +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "deranged" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" +dependencies = [ + "powerfmt", + "serde_core", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "dircpy" version = "0.3.19" @@ -456,6 +726,12 @@ dependencies = [ "syn", ] +[[package]] +name = "dyn-clone" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" + [[package]] name = "either" version = "1.15.0" @@ -487,6 +763,33 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "endi" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66b7e2430c6dff6a955451e2cfc438f09cea1965a9d6f87f7e3b90decc014099" + +[[package]] +name = "enumflags2" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1027f7680c853e056ebcec683615fb6fbbc07dbaa13b4d5d9442b146ded4ecef" +dependencies = [ + "enumflags2_derive", + "serde", +] + +[[package]] +name = "enumflags2_derive" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67c78a4d8fdf9953a5c9d458f9efe940fd97a0cab0941c075a813ac594733827" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -503,6 +806,27 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "event-listener" +version = "5.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13b66accf52311f30a0db42147dadea9850cb48cd070028831ae5f5d4b856ab" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" +dependencies = [ + "event-listener", + "pin-project-lite", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -551,6 +875,21 @@ dependencies = [ 
"percent-encoding", ] +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + [[package]] name = "futures-channel" version = "0.3.31" @@ -558,6 +897,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", + "futures-sink", ] [[package]] @@ -566,12 +906,47 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + [[package]] name = "futures-io" version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-lite" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f78e10609fe0e0b3f4157ffab1876319b5b0db102a2c60dc4626306dc46b44ad" +dependencies = [ + "fastrand", + "futures-core", + "futures-io", + "parking", + "pin-project-lite", +] + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "futures-sink" version = "0.3.31" @@ -590,8 +965,11 @@ version = "0.3.31" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ + "futures-channel", "futures-core", "futures-io", + "futures-macro", + "futures-sink", "futures-task", "memchr", "pin-project-lite", @@ -599,6 +977,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "gethostname" version = "0.4.3" @@ -609,6 +997,17 @@ dependencies = [ "windows-targets 0.48.5", ] +[[package]] +name = "getrandom" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + [[package]] name = "getrandom" version = "0.3.4" @@ -632,14 +1031,20 @@ dependencies = [ "futures-core", "futures-sink", "futures-util", - "http", - "indexmap", + "http 0.2.12", + "indexmap 2.12.1", "slab", "tokio", "tokio-util", "tracing", ] +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + [[package]] name = "hashbrown" version = "0.14.5" @@ -679,6 +1084,18 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "http" version = "0.2.12" @@ -690,6 +1107,16 @@ dependencies = [ "itoa", ] +[[package]] +name = "http" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +dependencies = [ + "bytes", + "itoa", +] + [[package]] name = "http-body" version = "0.4.6" @@ -697,7 +1124,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" dependencies = [ "bytes", - "http", + "http 0.2.12", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http 1.4.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http 1.4.0", + "http-body 1.0.1", "pin-project-lite", ] @@ -724,8 +1174,8 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http", - "http-body", + "http 0.2.12", + "http-body 0.4.6", "httparse", "httpdate", "itoa", @@ -737,6 +1187,43 @@ dependencies = [ "want", ] +[[package]] +name = "hyper" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "http 1.4.0", + "http-body 1.0.1", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "pin-utils", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-named-pipe" +version = "0.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "73b7d8abf35697b81a825e386fc151e0d503e8cb5fcb93cc8669c376dfd6f278" +dependencies = [ + "hex", + "hyper 1.8.1", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", + "winapi", +] + [[package]] name = "hyper-tls" version = "0.5.0" @@ -744,12 +1231,48 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6183ddfa99b85da61a140bea0efc93fdf56ceaa041b37d553518030827f9905" dependencies = [ "bytes", - "hyper", + "hyper 0.14.32", "native-tls", "tokio", "tokio-native-tls", ] +[[package]] +name = "hyper-util" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52e9a2a24dc5c6821e71a7030e1e14b7b632acac55c40e9d2e082c621261bb56" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "hyper 1.8.1", + "libc", + "pin-project-lite", + "socket2 0.6.1", + "tokio", + "tower-service", + "tracing", +] + +[[package]] +name = "hyperlocal" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "986c5ce3b994526b3cd75578e62554abd09f0899d6206de48b3e96ab34ccc8c7" +dependencies = [ + "hex", + "http-body-util", + "hyper 1.8.1", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + [[package]] name = "iana-time-zone" version = "0.1.64" @@ -876,6 +1399,17 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", + "serde", +] + [[package]] name = "indexmap" version = "2.12.1" @@ -884,6 +1418,8 @@ checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2" dependencies = [ "equivalent", "hashbrown 0.16.1", + "serde", + "serde_core", ] [[package]] @@ -928,7 +1464,7 @@ 
version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" dependencies = [ - "getrandom", + "getrandom 0.3.4", "libc", ] @@ -1037,6 +1573,15 @@ version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + [[package]] name = "mime" version = "0.3.17" @@ -1083,6 +1628,94 @@ dependencies = [ "tempfile", ] +[[package]] +name = "netlink-packet-core" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72724faf704479d67b388da142b186f916188505e7e0b26719019c525882eda4" +dependencies = [ + "anyhow", + "byteorder", + "netlink-packet-utils", +] + +[[package]] +name = "netlink-packet-route" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74c171cd77b4ee8c7708da746ce392440cb7bcf618d122ec9ecc607b12938bf4" +dependencies = [ + "anyhow", + "byteorder", + "libc", + "log", + "netlink-packet-core", + "netlink-packet-utils", +] + +[[package]] +name = "netlink-packet-utils" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ede8a08c71ad5a95cdd0e4e52facd37190977039a4704eb82a283f713747d34" +dependencies = [ + "anyhow", + "byteorder", + "paste", + "thiserror 1.0.69", +] + +[[package]] +name = "netlink-proto" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72452e012c2f8d612410d89eea01e2d9b56205274abb35d53f60200b2ec41d60" +dependencies = [ + "bytes", + "futures", + "log", + "netlink-packet-core", + "netlink-sys", + "thiserror 2.0.17", +] + +[[package]] +name = 
"netlink-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16c903aa70590cb93691bf97a767c8d1d6122d2cc9070433deb3bbf36ce8bd23" +dependencies = [ + "bytes", + "futures", + "libc", + "log", + "tokio", +] + +[[package]] +name = "nix" +version = "0.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" +dependencies = [ + "bitflags 2.10.0", + "cfg-if", + "libc", +] + +[[package]] +name = "nix" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71e2746dc3a24dd78b3cfcb7be93368c6de9963d30f43a6a73998a9cf4b17b46" +dependencies = [ + "bitflags 2.10.0", + "cfg-if", + "cfg_aliases", + "libc", + "memoffset", +] + [[package]] name = "nom" version = "8.0.0" @@ -1101,6 +1734,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-traits" version = "0.2.19" @@ -1175,6 +1814,22 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "ordered-stream" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aa2b01e1d916879f73a53d01d1d6cee68adbb31d6d9177a8cfce093cced1d50" +dependencies = [ + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + [[package]] name = "parking_lot" version = "0.12.5" @@ -1269,12 +1924,37 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "piper" +version = "0.2.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" +dependencies = [ + "atomic-waker", + "fastrand", + "futures-io", +] + [[package]] name = "pkg-config" version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "polling" +version = "3.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" +dependencies = [ + "cfg-if", + "concurrent-queue", + "hermit-abi", + "pin-project-lite", + "rustix", + "windows-sys 0.61.2", +] + [[package]] name = "potential_utf" version = "0.1.4" @@ -1284,6 +1964,30 @@ dependencies = [ "zerovec", ] +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro-crate" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" +dependencies = [ + "toml_edit 0.23.7", +] + [[package]] name = "proc-macro2" version = "1.0.103" @@ -1330,6 +2034,18 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", "rand_core", ] @@ -1338,6 +2054,9 @@ name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.16", +] [[package]] name = "ratatui" @@ -1386,6 +2105,26 @@ dependencies = [ "bitflags 2.10.0", ] +[[package]] +name = "ref-cast" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "regex" version = "1.12.2" @@ -1427,9 +2166,9 @@ dependencies = [ "futures-core", "futures-util", "h2", - "http", - "http-body", - "hyper", + "http 0.2.12", + "http-body 0.4.6", + "hyper 0.14.32", "hyper-tls", "ipnet", "js-sys", @@ -1455,6 +2194,24 @@ dependencies = [ "winreg", ] +[[package]] +name = "rtnetlink" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b684475344d8df1859ddb2d395dd3dac4f8f3422a1aa0725993cb375fc5caba5" +dependencies = [ + "futures", + "log", + "netlink-packet-core", + "netlink-packet-route", + "netlink-packet-utils", + "netlink-proto", + "netlink-sys", + "nix 0.27.1", + "thiserror 1.0.69", + "tokio", +] + [[package]] name = "rustix" version = "1.1.2" @@ -1507,6 +2264,30 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "schemars" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd191f9397d57d581cddd31014772520aa448f65ef991055d7f61582c65165f" +dependencies = [ + "dyn-clone", + 
"ref-cast", + "serde", + "serde_json", +] + +[[package]] +name = "schemars" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9558e172d4e8533736ba97870c4b2cd63f84b382a3d6eb063da41b91cce17289" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -1579,6 +2360,17 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_repr" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "serde_spanned" version = "0.6.9" @@ -1600,6 +2392,35 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_with" +version = "3.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fa237f2807440d238e0364a218270b98f767a00d3dada77b1c53ae88940e2e7" +dependencies = [ + "base64 0.22.1", + "chrono", + "hex", + "indexmap 1.9.3", + "indexmap 2.12.1", + "schemars 0.9.0", + "schemars 1.1.0", + "serde_core", + "serde_json", + "time", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -1702,6 +2523,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.11.1" @@ -1805,7 +2632,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ "fastrand", - 
"getrandom", + "getrandom 0.3.4", "once_cell", "rustix", "windows-sys 0.61.2", @@ -1817,7 +2644,16 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ - "thiserror-impl", + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +dependencies = [ + "thiserror-impl 2.0.17", ] [[package]] @@ -1831,6 +2667,17 @@ dependencies = [ "syn", ] +[[package]] +name = "thiserror-impl" +version = "2.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "thread_local" version = "1.1.9" @@ -1840,6 +2687,37 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "time" +version = "0.3.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" + +[[package]] +name = "time-macros" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" +dependencies = [ + "num-conv", + "time-core", +] + [[package]] name = "tinystr" version = "0.8.2" @@ -1909,8 +2787,8 @@ checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" dependencies = [ "serde", "serde_spanned", - "toml_datetime", - "toml_edit", + 
"toml_datetime 0.6.11", + "toml_edit 0.22.27", ] [[package]] @@ -1922,20 +2800,50 @@ dependencies = [ "serde", ] +[[package]] +name = "toml_datetime" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2cdb639ebbc97961c51720f858597f7f24c4fc295327923af55b74c3c724533" +dependencies = [ + "serde_core", +] + [[package]] name = "toml_edit" version = "0.22.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" dependencies = [ - "indexmap", + "indexmap 2.12.1", "serde", "serde_spanned", - "toml_datetime", + "toml_datetime 0.6.11", "toml_write", "winnow", ] +[[package]] +name = "toml_edit" +version = "0.23.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6485ef6d0d9b5d0ec17244ff7eb05310113c3f316f2d14200d4de56b3cb98f8d" +dependencies = [ + "indexmap 2.12.1", + "toml_datetime 0.7.3", + "toml_parser", + "winnow", +] + +[[package]] +name = "toml_parser" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0cbe268d35bdb4bb5a56a2de88d0ad0eb70af5384a99d648cd4b3d04039800e" +dependencies = [ + "winnow", +] + [[package]] name = "toml_write" version = "0.1.2" @@ -2015,6 +2923,23 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "typenum" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + +[[package]] +name = "uds_windows" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89daebc3e6fd160ac4aa9fc8b3bf71e1f74fbf92367ae71fb83a037e8bf164b9" +dependencies = [ + "memoffset", + "tempfile", + "winapi", +] + [[package]] name = "unicode-ident" version = "1.0.22" @@ -2541,6 +3466,16 @@ 
version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" +[[package]] +name = "xdg-home" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec1cdab258fb55c0da61328dc52c8764709b249011b2cad0454c72f0bf10a1f6" +dependencies = [ + "libc", + "windows-sys 0.59.0", +] + [[package]] name = "yoke" version = "0.8.1" @@ -2564,6 +3499,68 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zbus" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb97012beadd29e654708a0fdb4c84bc046f537aecfde2c3ee0a9e4b4d48c725" +dependencies = [ + "async-broadcast", + "async-executor", + "async-fs", + "async-io", + "async-lock", + "async-process", + "async-recursion", + "async-task", + "async-trait", + "blocking", + "enumflags2", + "event-listener", + "futures-core", + "futures-sink", + "futures-util", + "hex", + "nix 0.29.0", + "ordered-stream", + "rand", + "serde", + "serde_repr", + "sha1", + "static_assertions", + "tracing", + "uds_windows", + "windows-sys 0.52.0", + "xdg-home", + "zbus_macros", + "zbus_names", + "zvariant", +] + +[[package]] +name = "zbus_macros" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "267db9407081e90bbfa46d841d3cbc60f59c0351838c4bc65199ecd79ab1983e" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn", + "zvariant_utils", +] + +[[package]] +name = "zbus_names" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b9b1fef7d021261cc16cba64c351d291b715febe0fa10dc3a443ac5a5022e6c" +dependencies = [ + "serde", + "static_assertions", + "zvariant", +] + [[package]] name = "zerocopy" version = "0.8.30" @@ -2669,3 +3666,40 @@ dependencies = [ "system-deps", "zeromq-src", ] + +[[package]] +name = "zvariant" +version = "4.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2084290ab9a1c471c38fc524945837734fbf124487e105daec2bb57fd48c81fe" +dependencies = [ + "endi", + "enumflags2", + "serde", + "static_assertions", + "zvariant_derive", +] + +[[package]] +name = "zvariant_derive" +version = "4.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73e2ba546bda683a90652bac4a279bc146adad1386f25379cf73200d2002c449" +dependencies = [ + "proc-macro-crate", + "proc-macro2", + "quote", + "syn", + "zvariant_utils", +] + +[[package]] +name = "zvariant_utils" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c51bcff7cc3dbb5055396bcf774748c3dab426b4b8659046963523cee4808340" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 6691dad..268a9d4 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.196" +version = "0.1.197" edition = "2021" [dependencies] @@ -20,4 +20,17 @@ gethostname = { workspace = true } chrono-tz = "0.8" toml = { workspace = true } async-trait = "0.1" -reqwest = { version = "0.11", features = ["json", "blocking"] } \ No newline at end of file +reqwest = { version = "0.11", features = ["json", "blocking"] } + +# Native system APIs +nix = { version = "0.29", features = ["fs"] } +rtnetlink = "0.14" +netlink-packet-route = "0.19" +futures = "0.3" +libc = "0.2" + +# Docker API client +bollard = "0.17" + +# D-Bus client for systemd +zbus = "4.0" \ No newline at end of file diff --git a/agent/src/agent.rs b/agent/src/agent.rs index d74a4c7..e6eeb47 100644 --- a/agent/src/agent.rs +++ b/agent/src/agent.rs @@ -4,7 +4,7 @@ use std::time::Duration; use tokio::time::interval; use tracing::{debug, error, info}; -use crate::communication::{AgentCommand, ZmqHandler}; +use crate::communication::ZmqHandler; use crate::config::AgentConfig; use crate::collectors::{ Collector, 
@@ -134,12 +134,6 @@ impl Agent { // NOTE: With structured data, we might need to implement status tracking differently // For now, we skip this until status evaluation is migrated } - // Handle incoming commands (check periodically) - _ = tokio::time::sleep(Duration::from_millis(100)) => { - if let Err(e) = self.handle_commands().await { - error!("Error handling commands: {}", e); - } - } _ = &mut shutdown_rx => { info!("Shutdown signal received, stopping agent loop"); break; @@ -259,36 +253,4 @@ impl Agent { Ok(()) } - /// Handle incoming commands from dashboard - async fn handle_commands(&mut self) -> Result<()> { - // Try to receive a command (non-blocking) - if let Ok(Some(command)) = self.zmq_handler.try_receive_command() { - info!("Received command: {:?}", command); - - match command { - AgentCommand::CollectNow => { - info!("Received immediate collection request"); - if let Err(e) = self.collect_and_broadcast().await { - error!("Failed to collect on demand: {}", e); - } - } - AgentCommand::SetInterval { seconds } => { - info!("Received interval change request: {}s", seconds); - // Note: This would require more complex handling to update the interval - // For now, just acknowledge - } - AgentCommand::ToggleCollector { name, enabled } => { - info!("Received collector toggle request: {} -> {}", name, enabled); - // Note: This would require more complex handling to enable/disable collectors - // For now, just acknowledge - } - AgentCommand::Ping => { - info!("Received ping command"); - // Maybe send back a pong or status - } - } - } - Ok(()) - } - } \ No newline at end of file diff --git a/agent/src/collectors/cpu.rs b/agent/src/collectors/cpu.rs index 6c870de..6d86c76 100644 --- a/agent/src/collectors/cpu.rs +++ b/agent/src/collectors/cpu.rs @@ -1,22 +1,25 @@ use async_trait::async_trait; -use cm_dashboard_shared::{AgentData, Status, HysteresisThresholds}; +use cm_dashboard_shared::{AgentData, Status, HysteresisThresholds, CpuData}; +use std::sync::RwLock; 
+use std::time::Instant; use tracing::debug; use super::{utils, Collector, CollectorError}; use crate::config::CpuConfig; -/// Extremely efficient CPU metrics collector -/// -/// EFFICIENCY OPTIMIZATIONS: -/// - Single /proc/loadavg read for all load metrics -/// - Single /proc/stat read for CPU usage -/// - Minimal string allocations -/// - No process spawning -/// - <0.1ms collection time target +/// Extremely efficient CPU metrics collector with interval-based caching pub struct CpuCollector { load_thresholds: HysteresisThresholds, temperature_thresholds: HysteresisThresholds, + config: CpuConfig, + state: RwLock, +} + +#[derive(Debug, Clone)] +struct CpuCacheState { + last_collection: Option, + cached_data: CpuData, } impl CpuCollector { @@ -26,15 +29,39 @@ impl CpuCollector { config.load_warning_threshold, config.load_critical_threshold, ); - + let temperature_thresholds = HysteresisThresholds::new( config.temperature_warning_threshold, config.temperature_critical_threshold, ); - + Self { load_thresholds, temperature_thresholds, + config, + state: RwLock::new(CpuCacheState { + last_collection: None, + cached_data: CpuData { + load_1min: 0.0, + load_5min: 0.0, + load_15min: 0.0, + frequency_mhz: 0.0, + temperature_celsius: None, + load_status: Status::Unknown, + temperature_status: Status::Unknown, + }, + }), + } + } + + fn should_update_cache(&self) -> bool { + let state = self.state.read().unwrap(); + match state.last_collection { + None => true, + Some(last) => { + let cache_duration = std::time::Duration::from_secs(self.config.interval_seconds); + last.elapsed() > cache_duration + } } } @@ -156,6 +183,14 @@ impl CpuCollector { #[async_trait] impl Collector for CpuCollector { async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> { + // Check if cache is valid + if !self.should_update_cache() { + let state = self.state.read().unwrap(); + agent_data.system.cpu = state.cached_data.clone(); + debug!("Using cached CPU data 
(interval: {}s)", self.config.interval_seconds); + return Ok(()); + } + debug!("Collecting CPU metrics"); let start = std::time::Instant::now(); @@ -187,6 +222,11 @@ impl Collector for CpuCollector { Status::Unknown }; + // Update cache + let mut state = self.state.write().unwrap(); + state.last_collection = Some(Instant::now()); + state.cached_data = agent_data.system.cpu.clone(); + Ok(()) } } diff --git a/agent/src/collectors/disk.rs b/agent/src/collectors/disk.rs index 71c53cf..31cc945 100644 --- a/agent/src/collectors/disk.rs +++ b/agent/src/collectors/disk.rs @@ -6,6 +6,7 @@ use crate::config::DiskConfig; use std::process::Command; use std::time::Instant; use std::collections::HashMap; +use std::sync::RwLock; use tracing::debug; use super::{Collector, CollectorError}; @@ -14,6 +15,19 @@ use super::{Collector, CollectorError}; pub struct DiskCollector { config: DiskConfig, temperature_thresholds: HysteresisThresholds, + /// Cached state with thread-safe interior mutability + state: RwLock, +} + +/// Internal state for disk caching +#[derive(Debug, Clone)] +struct DiskCacheState { + /// Last collection time for performance tracking + last_collection: Option, + /// Cached drive data + cached_drives: Vec, + /// Cached pool data + cached_pools: Vec, } /// A physical drive with its filesystems @@ -58,10 +72,17 @@ impl DiskCollector { config.temperature_warning_celsius, config.temperature_critical_celsius, ); - + + let state = DiskCacheState { + last_collection: None, + cached_drives: Vec::new(), + cached_pools: Vec::new(), + }; + Self { config, temperature_thresholds, + state: RwLock::new(state), } } @@ -104,40 +125,70 @@ impl DiskCollector { self.populate_drives_data(&physical_drives, &smart_data, agent_data)?; self.populate_pools_data(&mergerfs_pools, &smart_data, agent_data)?; + // Step 7: Update cache with fresh data + { + let mut state = self.state.write().unwrap(); + state.cached_drives = agent_data.system.storage.drives.clone(); + state.cached_pools = 
agent_data.system.storage.pools.clone(); + state.last_collection = Some(Instant::now()); + } + let elapsed = start_time.elapsed(); debug!("Storage collection completed in {:?}", elapsed); Ok(()) } - /// Get block devices and their mount points using lsblk + /// Check if disk collection cache should be updated + fn should_update_cache(&self) -> bool { + let state = self.state.read().unwrap(); + + match state.last_collection { + None => true, + Some(last) => { + let cache_duration = std::time::Duration::from_secs(self.config.interval_seconds); + last.elapsed() > cache_duration + } + } + } + + /// Get cached disk data if available and fresh + fn get_cached_data(&self) -> Option<(Vec, Vec)> { + if !self.should_update_cache() { + let state = self.state.read().unwrap(); + Some((state.cached_drives.clone(), state.cached_pools.clone())) + } else { + None + } + } + + /// Get block devices and their mount points by reading /proc/mounts async fn get_mount_devices(&self) -> Result, CollectorError> { - use super::run_command_with_timeout; - - let mut cmd = Command::new("lsblk"); - cmd.args(&["-rn", "-o", "NAME,MOUNTPOINT"]); - - let output = run_command_with_timeout(cmd, 2).await + let content = std::fs::read_to_string("/proc/mounts") .map_err(|e| CollectorError::SystemRead { - path: "block devices".to_string(), + path: "/proc/mounts".to_string(), error: e.to_string(), })?; let mut mount_devices = HashMap::new(); - for line in String::from_utf8_lossy(&output.stdout).lines() { + + for line in content.lines() { let parts: Vec<&str> = line.split_whitespace().collect(); - if parts.len() >= 2 { - let device_name = parts[0]; + if parts.len() >= 3 { + let device = parts[0]; let mount_point = parts[1]; - - // Skip swap partitions and unmounted devices - if mount_point == "[SWAP]" || mount_point.is_empty() { + let fs_type = parts[2]; + + // Skip pseudo filesystems and fuse mounts + if fs_type.starts_with("fuse") || + matches!(fs_type, "proc" | "sysfs" | "tmpfs" | "devtmpfs" | + "devpts" 
| "cgroup" | "cgroup2" | "pstore" | "bpf" | + "tracefs" | "debugfs" | "securityfs" | "hugetlbfs" | + "mqueue" | "configfs" | "autofs") { continue; } - - // Convert device name to full path - let device_path = format!("/dev/{}", device_name); - mount_devices.insert(mount_point.to_string(), device_path); + + mount_devices.insert(mount_point.to_string(), device.to_string()); } } @@ -187,44 +238,20 @@ impl DiskCollector { Ok(()) } - /// Get filesystem info for a single mount point + /// Get filesystem info for a single mount point using statvfs syscall fn get_filesystem_info(&self, mount_point: &str) -> Result<(u64, u64), CollectorError> { - let output = std::process::Command::new("timeout") - .args(&["2", "df", "--block-size=1", mount_point]) - .output() - .map_err(|e| CollectorError::SystemRead { - path: format!("df {}", mount_point), - error: e.to_string(), - })?; + use nix::sys::statvfs::statvfs; - let output_str = String::from_utf8_lossy(&output.stdout); - let lines: Vec<&str> = output_str.lines().collect(); - - if lines.len() < 2 { - return Err(CollectorError::Parse { - value: output_str.to_string(), - error: "Expected at least 2 lines from df output".to_string(), - }); - } - - // Parse the data line (skip header) - let parts: Vec<&str> = lines[1].split_whitespace().collect(); - if parts.len() < 4 { - return Err(CollectorError::Parse { - value: lines[1].to_string(), - error: "Expected at least 4 fields in df output".to_string(), - }); - } - - let total_bytes: u64 = parts[1].parse().map_err(|e| CollectorError::Parse { - value: parts[1].to_string(), - error: format!("Failed to parse total bytes: {}", e), + let stat = statvfs(mount_point).map_err(|e| CollectorError::SystemRead { + path: mount_point.to_string(), + error: format!("statvfs failed: {}", e), })?; - let used_bytes: u64 = parts[2].parse().map_err(|e| CollectorError::Parse { - value: parts[2].to_string(), - error: format!("Failed to parse used bytes: {}", e), - })?; + // Calculate total and used bytes + let 
block_size = stat.fragment_size() as u64; + let total_bytes = stat.blocks() as u64 * block_size; + let available_bytes = stat.blocks_available() as u64 * block_size; + let used_bytes = total_bytes - available_bytes; Ok((total_bytes, used_bytes)) } @@ -760,32 +787,29 @@ impl DiskCollector { Ok((data_drives, parity_drives)) } - /// Get drive information for a mount path + /// Get drive information for a mount path by reading /proc/mounts fn get_drive_info_for_path(&self, path: &str) -> anyhow::Result { - // Use lsblk to find the backing device with timeout - let output = Command::new("timeout") - .args(&["2", "lsblk", "-rn", "-o", "NAME,MOUNTPOINT"]) - .output() - .map_err(|e| anyhow::anyhow!("Failed to run lsblk: {}", e))?; - - let output_str = String::from_utf8_lossy(&output.stdout); + // Read /proc/mounts to find the backing device + let content = std::fs::read_to_string("/proc/mounts") + .map_err(|e| anyhow::anyhow!("Failed to read /proc/mounts: {}", e))?; + let mut device = String::new(); - - for line in output_str.lines() { + + for line in content.lines() { let parts: Vec<&str> = line.split_whitespace().collect(); if parts.len() >= 2 && parts[1] == path { device = parts[0].to_string(); break; } } - + if device.is_empty() { return Err(anyhow::anyhow!("Could not find device for path {}", path)); } - - // Extract base device name (e.g., "sda1" -> "sda") - let base_device = self.extract_base_device(&format!("/dev/{}", device)); - + + // Extract base device name (e.g., "/dev/sda1" -> "sda") + let base_device = self.extract_base_device(&device); + // Get temperature from SMART data if available let temperature = if let Ok(smart_data) = tokio::task::block_in_place(|| { tokio::runtime::Handle::current().block_on(self.get_smart_data(&base_device)) @@ -794,7 +818,7 @@ impl DiskCollector { } else { None }; - + Ok(PoolDrive { name: base_device, mount_point: path.to_string(), @@ -838,7 +862,15 @@ impl DiskCollector { #[async_trait] impl Collector for DiskCollector { async 
fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> { - self.collect_storage_data(agent_data).await + // Use cached data if available and fresh + if let Some((cached_drives, cached_pools)) = self.get_cached_data() { + agent_data.system.storage.drives = cached_drives; + agent_data.system.storage.pools = cached_pools; + Ok(()) + } else { + // Collect fresh data + self.collect_storage_data(agent_data).await + } } } diff --git a/agent/src/collectors/memory.rs b/agent/src/collectors/memory.rs index e186704..03d613e 100644 --- a/agent/src/collectors/memory.rs +++ b/agent/src/collectors/memory.rs @@ -95,62 +95,47 @@ impl MemoryCollector { Ok(()) } - /// Populate tmpfs data into AgentData + /// Populate tmpfs data into AgentData using statvfs syscall async fn populate_tmpfs_data(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> { + use nix::sys::statvfs::statvfs; + // Discover all tmpfs mount points let tmpfs_mounts = self.discover_tmpfs_mounts()?; - + if tmpfs_mounts.is_empty() { debug!("No tmpfs mounts found to monitor"); return Ok(()); } - // Get usage data for all tmpfs mounts at once using df (with 2 second timeout) - let mut df_args = vec!["2", "df", "--output=target,size,used", "--block-size=1"]; - df_args.extend(tmpfs_mounts.iter().map(|s| s.as_str())); + // Get usage data for each tmpfs mount using statvfs syscall + for mount_point in tmpfs_mounts { + match statvfs(mount_point.as_str()) { + Ok(stat) => { + let block_size = stat.fragment_size() as u64; + let total_bytes = stat.blocks() as u64 * block_size; + let available_bytes = stat.blocks_available() as u64 * block_size; + let used_bytes = total_bytes - available_bytes; - let df_output = std::process::Command::new("timeout") - .args(&df_args[..]) - .output() - .map_err(|e| CollectorError::SystemRead { - path: "tmpfs mounts".to_string(), - error: e.to_string(), - })?; + if total_bytes == 0 { + continue; + } - let df_str = 
String::from_utf8_lossy(&df_output.stdout); - let df_lines: Vec<&str> = df_str.lines().skip(1).collect(); // Skip header + let total_gb = total_bytes as f32 / (1024.0 * 1024.0 * 1024.0); + let used_gb = used_bytes as f32 / (1024.0 * 1024.0 * 1024.0); + let usage_percent = (used_bytes as f32 / total_bytes as f32) * 100.0; - // Process each tmpfs mount - for (i, mount_point) in tmpfs_mounts.iter().enumerate() { - if i >= df_lines.len() { - debug!("Not enough df output lines for tmpfs mount: {}", mount_point); - continue; + // Add to tmpfs list + agent_data.system.memory.tmpfs.push(TmpfsData { + mount: mount_point.clone(), + usage_percent, + used_gb, + total_gb, + }); + } + Err(e) => { + debug!("Failed to get stats for tmpfs mount {}: {}", mount_point, e); + } } - - let parts: Vec<&str> = df_lines[i].split_whitespace().collect(); - if parts.len() < 3 { - debug!("Invalid df output for tmpfs mount: {}", mount_point); - continue; - } - - let total_bytes: u64 = parts[1].parse().unwrap_or(0); - let used_bytes: u64 = parts[2].parse().unwrap_or(0); - - if total_bytes == 0 { - continue; - } - - let total_gb = total_bytes as f32 / (1024.0 * 1024.0 * 1024.0); - let used_gb = used_bytes as f32 / (1024.0 * 1024.0 * 1024.0); - let usage_percent = (used_bytes as f32 / total_bytes as f32) * 100.0; - - // Add to tmpfs list - agent_data.system.memory.tmpfs.push(TmpfsData { - mount: mount_point.clone(), - usage_percent, - used_gb, - total_gb, - }); } // Sort tmpfs mounts by mount point for consistent display order diff --git a/agent/src/collectors/network.rs b/agent/src/collectors/network.rs index fd4dbe2..302cb89 100644 --- a/agent/src/collectors/network.rs +++ b/agent/src/collectors/network.rs @@ -1,7 +1,12 @@ use async_trait::async_trait; use cm_dashboard_shared::{AgentData, NetworkInterfaceData, Status}; -use std::process::Command; use tracing::debug; +use futures::stream::TryStreamExt; +use rtnetlink::{new_connection, IpVersion}; +use netlink_packet_route::link::LinkAttribute; 
+use netlink_packet_route::address::AddressAttribute; +use netlink_packet_route::route::RouteAttribute; +use std::net::IpAddr; use super::{Collector, CollectorError}; use crate::config::NetworkConfig; @@ -49,36 +54,52 @@ impl NetworkCollector { } } - /// Get the primary physical interface (the one with default route) - fn get_primary_physical_interface() -> Option { - match Command::new("timeout").args(["2", "ip", "route", "show", "default"]).output() { - Ok(output) if output.status.success() => { - let output_str = String::from_utf8_lossy(&output.stdout); - // Parse: "default via 192.168.1.1 dev eno1 ..." - for line in output_str.lines() { - if line.starts_with("default") { - if let Some(dev_pos) = line.find(" dev ") { - let after_dev = &line[dev_pos + 5..]; - if let Some(space_pos) = after_dev.find(' ') { - let interface = &after_dev[..space_pos]; - // Only return if it's a physical interface - if Self::is_physical_interface(interface) { - return Some(interface.to_string()); - } + /// Get the primary physical interface (the one with default route) using rtnetlink + async fn get_primary_physical_interface() -> Option { + let (connection, handle, _) = match new_connection() { + Ok(conn) => conn, + Err(e) => { + debug!("Failed to create netlink connection: {}", e); + return None; + } + }; + + tokio::spawn(connection); + + // Get default route + let mut routes = handle.route().get(IpVersion::V4).execute(); + + while let Ok(Some(route)) = routes.try_next().await { + // Check if this is a default route (destination is 0.0.0.0/0) + if route.header.destination_prefix_length == 0 { + // Find the output interface (OIF) attribute + if let Some(oif) = route.attributes.iter().find_map(|attr| { + if let RouteAttribute::Oif(index) = attr { + Some(*index) + } else { + None + } + }) { + // Get interface name from index + let mut link = handle.link().get().match_index(oif).execute(); + if let Ok(Some(link_msg)) = link.try_next().await { + if let Some(name) = 
link_msg.attributes.iter().find_map(|attr| { + if let LinkAttribute::IfName(n) = attr { + Some(n.to_string()) } else { - // No space after interface name (end of line) - let interface = after_dev.trim(); - if Self::is_physical_interface(interface) { - return Some(interface.to_string()); - } + None + } + }) { + if Self::is_physical_interface(&name) { + return Some(name); } } } } - None } - _ => None, } + + None } /// Parse VLAN configuration from /proc/net/vlan/config @@ -103,102 +124,105 @@ impl NetworkCollector { vlan_map } - /// Collect network interfaces using ip command + /// Collect network interfaces using rtnetlink async fn collect_interfaces(&self) -> Vec { let mut interfaces = Vec::new(); // Parse VLAN configuration let vlan_map = Self::parse_vlan_config(); - match Command::new("timeout").args(["2", "ip", "-j", "addr"]).output() { - Ok(output) if output.status.success() => { - let json_str = String::from_utf8_lossy(&output.stdout); + // Create netlink connection + let (connection, handle, _) = match new_connection() { + Ok(conn) => conn, + Err(e) => { + debug!("Failed to create netlink connection: {}", e); + return interfaces; + } + }; - if let Ok(json_data) = serde_json::from_str::(&json_str) { - if let Some(ifaces) = json_data.as_array() { - for iface in ifaces { - let name = iface["ifname"].as_str().unwrap_or("").to_string(); + tokio::spawn(connection); - // Skip loopback, empty names, and ifb* interfaces - if name.is_empty() || name == "lo" || name.starts_with("ifb") { - continue; - } + // Get all links + let mut links = handle.link().get().execute(); - // Parse parent interface from @parent notation (e.g., lan@enp0s31f6) - let (interface_name, parent_interface) = if let Some(at_pos) = name.find('@') { - let (child, parent) = name.split_at(at_pos); - (child.to_string(), Some(parent[1..].to_string())) - } else { - (name.clone(), None) - }; + while let Ok(Some(link)) = links.try_next().await { + // Get interface name + let name = match 
link.attributes.iter().find_map(|attr| { + if let LinkAttribute::IfName(n) = attr { + Some(n.to_string()) + } else { + None + } + }) { + Some(n) => n, + None => continue, + }; - let mut ipv4_addresses = Vec::new(); - let mut ipv6_addresses = Vec::new(); + // Skip loopback and ifb interfaces + if name == "lo" || name.starts_with("ifb") { + continue; + } - // Extract IP addresses - if let Some(addr_info) = iface["addr_info"].as_array() { - for addr in addr_info { - if let Some(family) = addr["family"].as_str() { - if let Some(local) = addr["local"].as_str() { - match family { - "inet" => ipv4_addresses.push(local.to_string()), - "inet6" => { - // Skip link-local IPv6 addresses (fe80::) - if !local.starts_with("fe80:") { - ipv6_addresses.push(local.to_string()); - } - } - _ => {} - } - } - } + // Parse parent interface from @parent notation (e.g., lan@enp0s31f6) + let (interface_name, parent_interface) = if let Some(at_pos) = name.find('@') { + let (child, parent) = name.split_at(at_pos); + (child.to_string(), Some(parent[1..].to_string())) + } else { + (name.clone(), None) + }; + + // Get IP addresses for this interface + let mut ipv4_addresses = Vec::new(); + let mut ipv6_addresses = Vec::new(); + + let mut addrs = handle.address().get().set_link_index_filter(link.header.index).execute(); + while let Ok(Some(addr)) = addrs.try_next().await { + for nla in &addr.attributes { + if let AddressAttribute::Address(ip) = nla { + match ip { + IpAddr::V4(ipv4) => ipv4_addresses.push(ipv4.to_string()), + IpAddr::V6(ipv6) => { + // Skip link-local IPv6 addresses (fe80::) + if !ipv6.to_string().starts_with("fe80:") { + ipv6_addresses.push(ipv6.to_string()); } } - - // Determine if physical and get status - let is_physical = Self::is_physical_interface(&interface_name); - - // Only filter out virtual interfaces without IPs - // Physical interfaces should always be shown even if down/no IPs - if !is_physical && ipv4_addresses.is_empty() && ipv6_addresses.is_empty() { - continue; - 
} - - let link_status = if is_physical { - Self::get_link_status(&name) - } else { - Status::Unknown // Virtual interfaces don't have meaningful link status - }; - - // Look up VLAN ID from the map (use original name before @ parsing) - let vlan_id = vlan_map.get(&name).copied(); - - interfaces.push(NetworkInterfaceData { - name: interface_name, - ipv4_addresses, - ipv6_addresses, - is_physical, - link_status, - parent_interface, - vlan_id, - }); } } } } - Err(e) => { - debug!("Failed to execute ip command: {}", e); - } - Ok(output) => { - debug!("ip command failed with status: {}", output.status); + + // Determine if physical + let is_physical = Self::is_physical_interface(&interface_name); + + // Only filter out virtual interfaces without IPs + if !is_physical && ipv4_addresses.is_empty() && ipv6_addresses.is_empty() { + continue; } + + let link_status = if is_physical { + Self::get_link_status(&name) + } else { + Status::Unknown + }; + + // Look up VLAN ID + let vlan_id = vlan_map.get(&name).copied(); + + interfaces.push(NetworkInterfaceData { + name: interface_name, + ipv4_addresses, + ipv6_addresses, + is_physical, + link_status, + parent_interface, + vlan_id, + }); } - // Assign primary physical interface as parent to virtual interfaces without explicit parent - let primary_interface = Self::get_primary_physical_interface(); - if let Some(primary) = primary_interface { + // Assign primary physical interface as parent to virtual interfaces + if let Some(primary) = Self::get_primary_physical_interface().await { for interface in interfaces.iter_mut() { - // Only assign parent to virtual interfaces that don't already have one if !interface.is_physical && interface.parent_interface.is_none() { interface.parent_interface = Some(primary.clone()); } diff --git a/agent/src/collectors/systemd.rs b/agent/src/collectors/systemd.rs index bcaa6be..9affb64 100644 --- a/agent/src/collectors/systemd.rs +++ b/agent/src/collectors/systemd.rs @@ -5,6 +5,9 @@ use 
std::process::Command; use std::sync::RwLock; use std::time::Instant; use tracing::{debug, warn}; +use bollard::Docker; +use bollard::container::ListContainersOptions; +use zbus::Connection; use super::{Collector, CollectorError}; use crate::config::SystemdConfig; @@ -74,7 +77,7 @@ impl SystemdCollector { debug!("Collecting systemd services metrics"); // Get cached services (discovery only happens when needed) - let monitored_services = match self.get_monitored_services() { + let monitored_services = match self.get_monitored_services().await { Ok(services) => services, Err(e) => { debug!("Failed to get monitored services: {}", e); @@ -119,7 +122,7 @@ impl SystemdCollector { } if service_name.contains("docker") && active_status == "active" { - let docker_containers = self.get_docker_containers(); + let docker_containers = self.get_docker_containers().await; for (container_name, container_status) in docker_containers { // For now, docker containers have no additional metrics // Future: could add memory_mb, cpu_percent, restart_count, etc. 
@@ -134,7 +137,7 @@ impl SystemdCollector { } // Add Docker images - let docker_images = self.get_docker_images(); + let docker_images = self.get_docker_images().await; for (image_name, image_status, image_size_mb) in docker_images { let mut metrics = Vec::new(); metrics.push(SubServiceMetric { @@ -190,7 +193,7 @@ impl SystemdCollector { } /// Get monitored services, discovering them if needed or cache is expired - fn get_monitored_services(&self) -> Result> { + async fn get_monitored_services(&self) -> Result> { // Check if we need discovery without holding the lock let needs_discovery = { let state = self.state.read().unwrap(); @@ -205,7 +208,7 @@ impl SystemdCollector { if needs_discovery { debug!("Discovering systemd services (cache expired or first run)"); - match self.discover_services_internal() { + match self.discover_services_internal().await { Ok((services, status_cache)) => { if let Ok(mut state) = self.state.write() { state.monitored_services = services.clone(); @@ -252,72 +255,46 @@ impl SystemdCollector { state.nginx_site_metrics.clone() } - /// Auto-discover interesting services to monitor - fn discover_services_internal(&self) -> Result<(Vec, std::collections::HashMap)> { - // First: Get all service unit files (with 3 second timeout) - let unit_files_output = Command::new("timeout") - .args(&["3", "systemctl", "list-unit-files", "--type=service", "--no-pager", "--plain"]) - .output()?; + /// Auto-discover interesting services to monitor using D-Bus + async fn discover_services_internal(&self) -> Result<(Vec, std::collections::HashMap)> { + // Connect to system D-Bus + let connection = Connection::system().await?; - if !unit_files_output.status.success() { - return Err(anyhow::anyhow!("systemctl list-unit-files command failed")); + // Get systemd manager proxy + let proxy = zbus::Proxy::new( + &connection, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + ).await?; + + // List all units via D-Bus + 
let units: Vec<(String, String, String, String, String, String, zbus::zvariant::OwnedObjectPath, u32, String, zbus::zvariant::OwnedObjectPath)> = + proxy.call("ListUnits", &()).await?; + + let mut all_service_names = std::collections::HashSet::new(); + let mut service_status_cache = std::collections::HashMap::new(); + + // Parse D-Bus response for services only + for unit in units { + let (unit_name, _description, load_state, active_state, sub_state, _followed, _unit_path, _job_id, _job_type, _job_path) = unit; + + if unit_name.ends_with(".service") { + let service_name = unit_name.trim_end_matches(".service"); + all_service_names.insert(service_name.to_string()); + + service_status_cache.insert(service_name.to_string(), ServiceStatusInfo { + load_state: load_state.clone(), + active_state: active_state.clone(), + sub_state: sub_state.clone(), + }); + } } - // Second: Get runtime status of all units (with 3 second timeout) - let units_status_output = Command::new("timeout") - .args(&["3", "systemctl", "list-units", "--type=service", "--all", "--no-pager", "--plain"]) - .output()?; - - if !units_status_output.status.success() { - return Err(anyhow::anyhow!("systemctl list-units command failed")); - } - - let unit_files_str = String::from_utf8(unit_files_output.stdout)?; - let units_status_str = String::from_utf8(units_status_output.stdout)?; let mut services = Vec::new(); - let excluded_services = &self.config.excluded_services; let service_name_filters = &self.config.service_name_filters; - // Parse all service unit files - let mut all_service_names = std::collections::HashSet::new(); - for line in unit_files_str.lines() { - let fields: Vec<&str> = line.split_whitespace().collect(); - if fields.len() >= 2 && fields[0].ends_with(".service") { - let service_name = fields[0].trim_end_matches(".service"); - all_service_names.insert(service_name.to_string()); - } - } - - // Parse runtime status for all units - let mut status_cache = std::collections::HashMap::new(); - 
for line in units_status_str.lines() { - let fields: Vec<&str> = line.split_whitespace().collect(); - if fields.len() >= 4 && fields[0].ends_with(".service") { - let service_name = fields[0].trim_end_matches(".service"); - let load_state = fields.get(1).unwrap_or(&"unknown").to_string(); - let active_state = fields.get(2).unwrap_or(&"unknown").to_string(); - let sub_state = fields.get(3).unwrap_or(&"unknown").to_string(); - - status_cache.insert(service_name.to_string(), ServiceStatusInfo { - load_state, - active_state, - sub_state, - }); - } - } - - // For services found in unit files but not in runtime status, set default inactive status - for service_name in &all_service_names { - if !status_cache.contains_key(service_name) { - status_cache.insert(service_name.to_string(), ServiceStatusInfo { - load_state: "not-loaded".to_string(), - active_state: "inactive".to_string(), - sub_state: "dead".to_string(), - }); - } - } - // Process all discovered services and apply filters for service_name in &all_service_names { // Skip excluded services first @@ -342,7 +319,7 @@ impl SystemdCollector { } } - Ok((services, status_cache)) + Ok((services, service_status_cache)) } /// Get service status from cache (if available) or fallback to systemctl @@ -541,7 +518,7 @@ impl SystemdCollector { match state.last_collection { None => true, Some(last) => { - let cache_duration = std::time::Duration::from_secs(30); + let cache_duration = std::time::Duration::from_secs(self.config.interval_seconds); last.elapsed() > cache_duration } } @@ -781,94 +758,91 @@ impl SystemdCollector { } } - /// Get docker containers as sub-services - fn get_docker_containers(&self) -> Vec<(String, String)> { + /// Get docker containers as sub-services using bollard API + async fn get_docker_containers(&self) -> Vec<(String, String)> { let mut containers = Vec::new(); - // Check if docker is available (cm-agent user is in docker group) - // Use -a to show ALL containers (running and stopped) with 3 second 
timeout - let output = Command::new("timeout") - .args(&["3", "docker", "ps", "-a", "--format", "{{.Names}},{{.Status}}"]) - .output(); - - let output = match output { - Ok(out) if out.status.success() => out, - _ => return containers, // Docker not available or failed + // Connect to Docker daemon + let docker = match Docker::connect_with_local_defaults() { + Ok(d) => d, + Err(e) => { + debug!("Failed to connect to Docker daemon: {}", e); + return containers; + } }; - let output_str = match String::from_utf8(output.stdout) { - Ok(s) => s, - Err(_) => return containers, + // List all containers (running and stopped) + let list_options = Some(ListContainersOptions:: { + all: true, + ..Default::default() + }); + + let container_list = match docker.list_containers(list_options).await { + Ok(list) => list, + Err(e) => { + debug!("Failed to list Docker containers: {}", e); + return containers; + } }; - for line in output_str.lines() { - if line.trim().is_empty() { - continue; - } + for container in container_list { + // Get container name (remove leading slash if present) + let container_name = container.names + .and_then(|names| names.first().map(|n| n.trim_start_matches('/').to_string())) + .unwrap_or_else(|| container.id.clone().unwrap_or_default()); - let parts: Vec<&str> = line.split(',').collect(); - if parts.len() >= 2 { - let container_name = parts[0].trim(); - let status_str = parts[1].trim(); + // Map container state to service status + let container_status = match container.state.as_deref() { + Some("running") => "active", + Some("exited") | Some("created") => "inactive", + _ => "failed", // restarting, paused, dead, etc. 
+ }; - let container_status = if status_str.contains("Up") { - "active" - } else if status_str.contains("Exited") || status_str.contains("Created") { - "inactive" // Stopped/created containers are inactive - } else { - "failed" // Other states (restarting, paused, dead) → failed - }; - - containers.push((format!("docker_{}", container_name), container_status.to_string())); - } + containers.push((format!("docker_{}", container_name), container_status.to_string())); } containers } - /// Get docker images as sub-services - fn get_docker_images(&self) -> Vec<(String, String, f32)> { + /// Get docker images as sub-services using bollard API + async fn get_docker_images(&self) -> Vec<(String, String, f32)> { let mut images = Vec::new(); - // Check if docker is available (cm-agent user is in docker group) with 3 second timeout - let output = Command::new("timeout") - .args(&["3", "docker", "images", "--format", "{{.Repository}}:{{.Tag}},{{.Size}}"]) - .output(); - let output = match output { - Ok(out) if out.status.success() => out, - Ok(_) => { - return images; - } - Err(_) => { + // Connect to Docker daemon + let docker = match Docker::connect_with_local_defaults() { + Ok(d) => d, + Err(e) => { + debug!("Failed to connect to Docker daemon: {}", e); return images; } }; - let output_str = match String::from_utf8(output.stdout) { - Ok(s) => s, - Err(_) => return images, + // List all images + let image_list = match docker.list_images::(None).await { + Ok(list) => list, + Err(e) => { + debug!("Failed to list Docker images: {}", e); + return images; + } }; - for line in output_str.lines() { - if line.trim().is_empty() { - continue; + for image in image_list { + // Get image name from repo tags + let image_names: Vec = image.repo_tags + .into_iter() + .filter(|tag| !tag.contains("")) + .collect(); + + if image_names.is_empty() { + continue; // Skip untagged images } - let parts: Vec<&str> = line.split(',').collect(); - if parts.len() >= 2 { - let image_name = parts[0].trim(); 
- let size_str = parts[1].trim(); - - // Skip : images (dangling images) - if image_name.contains("") { - continue; - } - - // Parse size to MB (sizes come as "142MB", "1.5GB", "512kB", etc.) - let size_mb = self.parse_docker_size(size_str); + // Get size in MB + let size_mb = image.size as f32 / (1024.0 * 1024.0); + for image_name in image_names { images.push(( - image_name.to_string(), + image_name, "inactive".to_string(), // Images are informational - use inactive for neutral display size_mb )); @@ -877,34 +851,6 @@ impl SystemdCollector { images } - - /// Parse Docker size string to MB - fn parse_docker_size(&self, size_str: &str) -> f32 { - let size_upper = size_str.to_uppercase(); - - // Extract numeric part and unit - let mut num_str = String::new(); - let mut unit = String::new(); - - for ch in size_upper.chars() { - if ch.is_ascii_digit() || ch == '.' { - num_str.push(ch); - } else if ch.is_alphabetic() { - unit.push(ch); - } - } - - let value: f32 = num_str.parse().unwrap_or(0.0); - - // Convert to MB - match unit.as_str() { - "KB" | "K" => value / 1024.0, - "MB" | "M" => value, - "GB" | "G" => value * 1024.0, - "TB" | "T" => value * 1024.0 * 1024.0, - _ => value, // Assume bytes if no unit - } - } } #[async_trait] diff --git a/agent/src/communication/mod.rs b/agent/src/communication/mod.rs index c364f7c..af24350 100644 --- a/agent/src/communication/mod.rs +++ b/agent/src/communication/mod.rs @@ -5,10 +5,9 @@ use zmq::{Context, Socket, SocketType}; use crate::config::ZmqConfig; -/// ZMQ communication handler for publishing metrics and receiving commands +/// ZMQ communication handler for publishing metrics pub struct ZmqHandler { publisher: Socket, - command_receiver: Socket, } impl ZmqHandler { @@ -26,20 +25,8 @@ impl ZmqHandler { publisher.set_sndhwm(1000)?; // High water mark for outbound messages publisher.set_linger(1000)?; // Linger time on close - // Create command receiver socket (PULL socket to receive commands from dashboard) - let 
command_receiver = context.socket(SocketType::PULL)?; - let cmd_bind_address = format!("tcp://{}:{}", config.bind_address, config.command_port); - command_receiver.bind(&cmd_bind_address)?; - - info!("ZMQ command receiver bound to {}", cmd_bind_address); - - // Set non-blocking mode for command receiver - command_receiver.set_rcvtimeo(0)?; // Non-blocking receive - command_receiver.set_linger(1000)?; - Ok(Self { publisher, - command_receiver, }) } @@ -65,36 +52,4 @@ impl ZmqHandler { Ok(()) } - /// Try to receive a command (non-blocking) - pub fn try_receive_command(&self) -> Result> { - match self.command_receiver.recv_bytes(zmq::DONTWAIT) { - Ok(bytes) => { - debug!("Received command message ({} bytes)", bytes.len()); - - let command: AgentCommand = serde_json::from_slice(&bytes) - .map_err(|e| anyhow::anyhow!("Failed to deserialize command: {}", e))?; - - debug!("Parsed command: {:?}", command); - Ok(Some(command)) - } - Err(zmq::Error::EAGAIN) => { - // No message available (non-blocking) - Ok(None) - } - Err(e) => Err(anyhow::anyhow!("ZMQ receive error: {}", e)), - } - } -} - -/// Commands that can be sent to the agent -#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] -pub enum AgentCommand { - /// Request immediate metric collection - CollectNow, - /// Change collection interval - SetInterval { seconds: u64 }, - /// Enable/disable a collector - ToggleCollector { name: String, enabled: bool }, - /// Request status/health check - Ping, } diff --git a/agent/src/config/mod.rs b/agent/src/config/mod.rs index 8593b54..0dd8f70 100644 --- a/agent/src/config/mod.rs +++ b/agent/src/config/mod.rs @@ -20,7 +20,6 @@ pub struct AgentConfig { #[derive(Debug, Clone, Serialize, Deserialize)] pub struct ZmqConfig { pub publisher_port: u16, - pub command_port: u16, pub bind_address: String, pub transmission_interval_seconds: u64, /// Heartbeat transmission interval in seconds for host connectivity detection diff --git a/agent/src/config/validation.rs 
b/agent/src/config/validation.rs index 2747418..410770d 100644 --- a/agent/src/config/validation.rs +++ b/agent/src/config/validation.rs @@ -7,14 +7,6 @@ pub fn validate_config(config: &AgentConfig) -> Result<()> { bail!("ZMQ publisher port cannot be 0"); } - if config.zmq.command_port == 0 { - bail!("ZMQ command port cannot be 0"); - } - - if config.zmq.publisher_port == config.zmq.command_port { - bail!("ZMQ publisher and command ports cannot be the same"); - } - if config.zmq.bind_address.is_empty() { bail!("ZMQ bind address cannot be empty"); } diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index da9e534..680323c 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.196" +version = "0.1.197" edition = "2021" [dependencies] diff --git a/shared/Cargo.toml b/shared/Cargo.toml index af42308..397ef3c 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.196" +version = "0.1.197" edition = "2021" [dependencies] From 7ad149bbe4b4b84bb06292bcc6a702e509190121 Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Fri, 28 Nov 2025 11:46:28 +0100 Subject: [PATCH 12/14] Replace all systemctl commands with zbus D-Bus API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Complete migration from systemctl subprocess calls to native D-Bus communication: **Removed systemctl commands:** - systemctl is-active (fallback) - use D-Bus cache from ListUnits - systemctl show --property=LoadState,ActiveState,SubState - use D-Bus cache - systemctl show --property=WorkingDirectory - use D-Bus Properties.Get - systemctl show --property=MemoryCurrent - use D-Bus Properties.Get - systemctl show nginx --property=ExecStart - use D-Bus Properties.Get **Implementation details:** - Added get_unit_property() helper for D-Bus property access - Made get_nginx_site_metrics() async to support D-Bus calls - 
Made get_nginx_sites_internal() async - Made discover_nginx_sites() async - Made get_nginx_config_from_systemd() async - Fixed RwLock guard Send issues by using scoped locks **Remaining external commands:** - smartctl (disk.rs) - No Rust alternative for SMART data - sudo du (systemd.rs) - Directory size measurement - nginx -T (systemd.rs) - Nginx config fallback - timeout hostname (nixos.rs) - Rare fallback only Version bump: 0.1.197 → 0.1.198 --- Cargo.lock | 6 +- agent/Cargo.toml | 2 +- agent/src/collectors/systemd.rs | 174 ++++++++++++++++---------------- dashboard/Cargo.toml | 2 +- shared/Cargo.toml | 2 +- 5 files changed, 93 insertions(+), 93 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 716aaae..818b66e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -493,7 +493,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.197" +version = "0.1.198" dependencies = [ "anyhow", "chrono", @@ -515,7 +515,7 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.197" +version = "0.1.198" dependencies = [ "anyhow", "async-trait", @@ -545,7 +545,7 @@ dependencies = [ [[package]] name = "cm-dashboard-shared" -version = "0.1.197" +version = "0.1.198" dependencies = [ "chrono", "serde", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 268a9d4..07fa8cc 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.197" +version = "0.1.198" edition = "2021" [dependencies] diff --git a/agent/src/collectors/systemd.rs b/agent/src/collectors/systemd.rs index 9affb64..f1e17b9 100644 --- a/agent/src/collectors/systemd.rs +++ b/agent/src/collectors/systemd.rs @@ -97,7 +97,7 @@ impl SystemdCollector { // Sub-service metrics for specific services (always include cached results) if service_name.contains("nginx") && active_status == "active" { - let nginx_sites = self.get_nginx_site_metrics(); + let nginx_sites = 
self.get_nginx_site_metrics().await; for (site_name, latency_ms) in nginx_sites { let site_status = if latency_ms >= 0.0 && latency_ms < self.config.nginx_latency_critical_ms { "active" @@ -231,27 +231,35 @@ impl SystemdCollector { } /// Get nginx site metrics, checking them if cache is expired (like old working version) - fn get_nginx_site_metrics(&self) -> Vec<(String, f32)> { - let mut state = self.state.write().unwrap(); - - // Check if we need to refresh nginx site metrics - let needs_refresh = match state.last_nginx_check_time { - None => true, // First time - Some(last_time) => { - let elapsed = last_time.elapsed().as_secs(); - elapsed >= state.nginx_check_interval_seconds + async fn get_nginx_site_metrics(&self) -> Vec<(String, f32)> { + // Check if we need to refresh (read lock) + let needs_refresh = { + let state = self.state.read().unwrap(); + match state.last_nginx_check_time { + None => true, + Some(last_time) => { + let elapsed = last_time.elapsed().as_secs(); + elapsed >= state.nginx_check_interval_seconds + } } }; if needs_refresh { - // Only check nginx sites if nginx service is active - if state.monitored_services.iter().any(|s| s.contains("nginx")) { - let fresh_metrics = self.get_nginx_sites_internal(); + // Check if nginx is active (read lock) + let has_nginx = { + let state = self.state.read().unwrap(); + state.monitored_services.iter().any(|s| s.contains("nginx")) + }; + + if has_nginx { + let fresh_metrics = self.get_nginx_sites_internal().await; + let mut state = self.state.write().unwrap(); state.nginx_site_metrics = fresh_metrics; state.last_nginx_check_time = Some(Instant::now()); } } + let state = self.state.read().unwrap(); state.nginx_site_metrics.clone() } @@ -322,9 +330,9 @@ impl SystemdCollector { Ok((services, service_status_cache)) } - /// Get service status from cache (if available) or fallback to systemctl + /// Get service status from D-Bus cache fn get_service_status(&self, service: &str) -> Result<(String, String)> { - // 
Try to get status from cache first + // Get status from D-Bus cache (populated by discover_services_internal) if let Ok(state) = self.state.read() { if let Some(cached_info) = state.service_status_cache.get(service) { let active_status = cached_info.active_state.clone(); @@ -338,20 +346,45 @@ impl SystemdCollector { } } - // Fallback to systemctl if not in cache (with 2 second timeout) - let output = Command::new("timeout") - .args(&["2", "systemctl", "is-active", &format!("{}.service", service)]) - .output()?; + // Service not found in D-Bus cache - treat as inactive + Ok(("inactive".to_string(), "LoadState=not-found\nActiveState=inactive\nSubState=dead".to_string())) + } - let active_status = String::from_utf8(output.stdout)?.trim().to_string(); + /// Get a unit property via D-Bus + async fn get_unit_property(&self, service_name: &str, property: &str) -> Option { + // Connect to system D-Bus + let connection = Connection::system().await.ok()?; - // Get more detailed info (with 2 second timeout) - let output = Command::new("timeout") - .args(&["2", "systemctl", "show", &format!("{}.service", service), "--property=LoadState,ActiveState,SubState"]) - .output()?; + // Get systemd manager proxy + let manager_proxy = zbus::Proxy::new( + &connection, + "org.freedesktop.systemd1", + "/org/freedesktop/systemd1", + "org.freedesktop.systemd1.Manager", + ).await.ok()?; - let detailed_info = String::from_utf8(output.stdout)?; - Ok((active_status, detailed_info)) + // Get unit path for service + let unit_name = format!("{}.service", service_name); + let unit_path: zbus::zvariant::OwnedObjectPath = manager_proxy + .call("GetUnit", &(unit_name,)) + .await + .ok()?; + + // Get property using standard D-Bus Properties interface + let prop_proxy = zbus::Proxy::new( + &connection, + "org.freedesktop.systemd1", + unit_path.as_str(), + "org.freedesktop.DBus.Properties", + ).await.ok()?; + + // Try Service interface first, fallback to Unit interface + // Get returns a Variant, we need 
to extract the inner value + if let Ok(variant) = prop_proxy.call("Get", &("org.freedesktop.systemd1.Service", property)).await { + return Some(variant); + } + + prop_proxy.call("Get", &("org.freedesktop.systemd1.Unit", property)).await.ok() } /// Check if service name matches pattern (supports wildcards like nginx*) @@ -407,21 +440,12 @@ impl SystemdCollector { return Ok(0.0); } - // No configured path - try to get WorkingDirectory from systemctl (with 2 second timeout) - let output = Command::new("timeout") - .args(&["2", "systemctl", "show", &format!("{}.service", service_name), "--property=WorkingDirectory"]) - .output() - .map_err(|e| CollectorError::SystemRead { - path: format!("WorkingDirectory for {}", service_name), - error: e.to_string(), - })?; - - let output_str = String::from_utf8_lossy(&output.stdout); - for line in output_str.lines() { - if line.starts_with("WorkingDirectory=") && !line.contains("[not set]") { - let dir = line.strip_prefix("WorkingDirectory=").unwrap_or(""); - if !dir.is_empty() && dir != "/" { - return Ok(self.get_directory_size(dir).await.unwrap_or(0.0)); + // No configured path - try to get WorkingDirectory from D-Bus + if let Some(value) = self.get_unit_property(service_name, "WorkingDirectory").await { + // WorkingDirectory is a string property - try to extract as string + if let Ok(dir_str) = ::try_from(value) { + if !dir_str.is_empty() && dir_str != "/" && dir_str != "[not set]" { + return Ok(self.get_directory_size(&dir_str).await.unwrap_or(0.0)); } } } @@ -484,27 +508,13 @@ impl SystemdCollector { } } - /// Get memory usage for a specific service + /// Get memory usage for a specific service via D-Bus async fn get_service_memory_usage(&self, service_name: &str) -> Result { - let output = Command::new("systemctl") - .args(&["show", &format!("{}.service", service_name), "--property=MemoryCurrent"]) - .output() - .map_err(|e| CollectorError::SystemRead { - path: format!("memory usage for {}", service_name), - error: 
e.to_string(), - })?; - - let output_str = String::from_utf8_lossy(&output.stdout); - - for line in output_str.lines() { - if line.starts_with("MemoryCurrent=") { - if let Some(mem_str) = line.strip_prefix("MemoryCurrent=") { - if mem_str != "[not set]" { - if let Ok(memory_bytes) = mem_str.parse::() { - return Ok(memory_bytes as f32 / (1024.0 * 1024.0)); // Convert to MB - } - } - } + // Get MemoryCurrent property from D-Bus + if let Some(value) = self.get_unit_property(service_name, "MemoryCurrent").await { + // MemoryCurrent is a u64 property (bytes) - try to extract + if let Ok(memory_bytes) = ::try_from(value) { + return Ok(memory_bytes as f32 / (1024.0 * 1024.0)); // Convert to MB } } @@ -535,11 +545,11 @@ impl SystemdCollector { } /// Get nginx sites with latency checks (internal - no caching) - fn get_nginx_sites_internal(&self) -> Vec<(String, f32)> { + async fn get_nginx_sites_internal(&self) -> Vec<(String, f32)> { let mut sites = Vec::new(); // Discover nginx sites from configuration - let discovered_sites = self.discover_nginx_sites(); + let discovered_sites = self.discover_nginx_sites().await; // Always add all discovered sites, even if checks fail (like old version) for (site_name, url) in &discovered_sites { @@ -558,9 +568,9 @@ impl SystemdCollector { } /// Discover nginx sites from configuration - fn discover_nginx_sites(&self) -> Vec<(String, String)> { + async fn discover_nginx_sites(&self) -> Vec<(String, String)> { // Use the same approach as the old working agent: get nginx config from systemd - let config_content = match self.get_nginx_config_from_systemd() { + let config_content = match self.get_nginx_config_from_systemd().await { Some(content) => content, None => { debug!("Could not get nginx config from systemd, trying nginx -T fallback"); @@ -593,30 +603,20 @@ impl SystemdCollector { Some(String::from_utf8_lossy(&output.stdout).to_string()) } - /// Get nginx config from systemd service definition (NixOS compatible) - fn 
get_nginx_config_from_systemd(&self) -> Option { - let output = Command::new("systemctl") - .args(&["show", "nginx", "--property=ExecStart", "--no-pager"]) - .output() - .ok()?; + /// Get nginx config from systemd service definition via D-Bus (NixOS compatible) + async fn get_nginx_config_from_systemd(&self) -> Option { + // Get ExecStart property from D-Bus + let value = self.get_unit_property("nginx", "ExecStart").await?; - if !output.status.success() { - debug!("Failed to get nginx ExecStart from systemd"); - return None; - } + // ExecStart is a complex structure: array of (path, args, unclean_exit) + // For our purposes, we need to extract the command line + let exec_start_str = format!("{:?}", value); + debug!("nginx ExecStart from D-Bus: {}", exec_start_str); - let stdout = String::from_utf8_lossy(&output.stdout); - debug!("systemctl show nginx output: {}", stdout); - - // Parse ExecStart to extract -c config path - for line in stdout.lines() { - if line.starts_with("ExecStart=") { - debug!("Found ExecStart line: {}", line); - if let Some(config_path) = self.extract_config_path_from_exec_start(line) { - debug!("Extracted config path: {}", config_path); - return std::fs::read_to_string(&config_path).ok(); - } - } + // Extract config path from ExecStart structure + if let Some(config_path) = self.extract_config_path_from_exec_start(&exec_start_str) { + debug!("Extracted config path: {}", config_path); + return std::fs::read_to_string(&config_path).ok(); } None diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index 680323c..bd441f2 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.197" +version = "0.1.198" edition = "2021" [dependencies] diff --git a/shared/Cargo.toml b/shared/Cargo.toml index 397ef3c..708b362 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.197" +version = "0.1.198" edition = "2021" 
[dependencies] From eab3f17428eefb1da04823a4dbf7fcda8fd2fabf Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Fri, 28 Nov 2025 11:57:31 +0100 Subject: [PATCH 13/14] Fix agent hang by reverting service discovery to systemctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The D-Bus ListUnits call in discover_services_internal() was causing the agent to hang on startup. **Root cause:** - D-Bus ListUnits call with complex tuple destructuring hung indefinitely - Agent never completed first collection cycle - No collector output in logs **Fix:** - Revert discover_services_internal() to use systemctl list-units/list-unit-files - Keep D-Bus-based property queries (WorkingDirectory, MemoryCurrent, ExecStart) - Hybrid approach: systemctl for discovery, D-Bus for individual queries **External commands still used:** - systemctl list-units, list-unit-files (service discovery) - smartctl (SMART data) - sudo du (directory sizes) - nginx -T (config fallback) Version bump: 0.1.198 → 0.1.199 --- Cargo.lock | 6 +-- agent/Cargo.toml | 2 +- agent/src/collectors/systemd.rs | 84 +++++++++++++++++++++------------ dashboard/Cargo.toml | 2 +- shared/Cargo.toml | 2 +- 5 files changed, 61 insertions(+), 35 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 818b66e..5a472ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -493,7 +493,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.198" +version = "0.1.199" dependencies = [ "anyhow", "chrono", @@ -515,7 +515,7 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.198" +version = "0.1.199" dependencies = [ "anyhow", "async-trait", @@ -545,7 +545,7 @@ dependencies = [ [[package]] name = "cm-dashboard-shared" -version = "0.1.198" +version = "0.1.199" dependencies = [ "chrono", "serde", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 07fa8cc..d99bcb9 100644 --- a/agent/Cargo.toml +++ 
b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.198" +version = "0.1.199" edition = "2021" [dependencies] diff --git a/agent/src/collectors/systemd.rs b/agent/src/collectors/systemd.rs index f1e17b9..2cd425d 100644 --- a/agent/src/collectors/systemd.rs +++ b/agent/src/collectors/systemd.rs @@ -263,45 +263,71 @@ impl SystemdCollector { state.nginx_site_metrics.clone() } - /// Auto-discover interesting services to monitor using D-Bus + /// Auto-discover interesting services to monitor using systemctl async fn discover_services_internal(&self) -> Result<(Vec, std::collections::HashMap)> { - // Connect to system D-Bus - let connection = Connection::system().await?; + // First: Get all service unit files (with 3 second timeout) + let unit_files_output = Command::new("timeout") + .args(&["3", "systemctl", "list-unit-files", "--type=service", "--no-pager", "--plain"]) + .output()?; - // Get systemd manager proxy - let proxy = zbus::Proxy::new( - &connection, - "org.freedesktop.systemd1", - "/org/freedesktop/systemd1", - "org.freedesktop.systemd1.Manager", - ).await?; + if !unit_files_output.status.success() { + return Err(anyhow::anyhow!("systemctl list-unit-files command failed")); + } - // List all units via D-Bus - let units: Vec<(String, String, String, String, String, String, zbus::zvariant::OwnedObjectPath, u32, String, zbus::zvariant::OwnedObjectPath)> = - proxy.call("ListUnits", &()).await?; + // Second: Get runtime status of all units (with 3 second timeout) + let units_status_output = Command::new("timeout") + .args(&["3", "systemctl", "list-units", "--type=service", "--all", "--no-pager", "--plain"]) + .output()?; + if !units_status_output.status.success() { + return Err(anyhow::anyhow!("systemctl list-units command failed")); + } + + let unit_files_str = String::from_utf8(unit_files_output.stdout)?; + let units_status_str = String::from_utf8(units_status_output.stdout)?; + let mut services = Vec::new(); + + let 
excluded_services = &self.config.excluded_services; + let service_name_filters = &self.config.service_name_filters; + + // Parse all service unit files let mut all_service_names = std::collections::HashSet::new(); - let mut service_status_cache = std::collections::HashMap::new(); - - // Parse D-Bus response for services only - for unit in units { - let (unit_name, _description, load_state, active_state, sub_state, _followed, _unit_path, _job_id, _job_type, _job_path) = unit; - - if unit_name.ends_with(".service") { - let service_name = unit_name.trim_end_matches(".service"); + for line in unit_files_str.lines() { + let fields: Vec<&str> = line.split_whitespace().collect(); + if fields.len() >= 2 && fields[0].ends_with(".service") { + let service_name = fields[0].trim_end_matches(".service"); all_service_names.insert(service_name.to_string()); + } + } - service_status_cache.insert(service_name.to_string(), ServiceStatusInfo { - load_state: load_state.clone(), - active_state: active_state.clone(), - sub_state: sub_state.clone(), + // Parse runtime status for all units + let mut status_cache = std::collections::HashMap::new(); + for line in units_status_str.lines() { + let fields: Vec<&str> = line.split_whitespace().collect(); + if fields.len() >= 4 && fields[0].ends_with(".service") { + let service_name = fields[0].trim_end_matches(".service"); + let load_state = fields.get(1).unwrap_or(&"unknown").to_string(); + let active_state = fields.get(2).unwrap_or(&"unknown").to_string(); + let sub_state = fields.get(3).unwrap_or(&"unknown").to_string(); + + status_cache.insert(service_name.to_string(), ServiceStatusInfo { + load_state, + active_state, + sub_state, }); } } - let mut services = Vec::new(); - let excluded_services = &self.config.excluded_services; - let service_name_filters = &self.config.service_name_filters; + // For services found in unit files but not in runtime status, set default inactive status + for service_name in &all_service_names { + if 
!status_cache.contains_key(service_name) { + status_cache.insert(service_name.to_string(), ServiceStatusInfo { + load_state: "not-loaded".to_string(), + active_state: "inactive".to_string(), + sub_state: "dead".to_string(), + }); + } + } // Process all discovered services and apply filters for service_name in &all_service_names { @@ -327,7 +353,7 @@ impl SystemdCollector { } } - Ok((services, service_status_cache)) + Ok((services, status_cache)) } /// Get service status from D-Bus cache diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index bd441f2..16f49f6 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.198" +version = "0.1.199" edition = "2021" [dependencies] diff --git a/shared/Cargo.toml b/shared/Cargo.toml index 708b362..af20240 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.198" +version = "0.1.199" edition = "2021" [dependencies] From 85c6c624fb8901b98402cbc7efe1c9e41ca38182 Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Fri, 28 Nov 2025 12:15:04 +0100 Subject: [PATCH 14/14] Revert D-Bus usage, use systemctl commands only - Remove zbus dependency from agent - Replace D-Bus Connection calls with systemctl show commands - Fix agent hang by eliminating blocking D-Bus operations - get_unit_property now uses systemctl show with property flags - Memory, disk usage, and nginx config queries use systemctl - Simpler, more reliable service monitoring --- Cargo.lock | 553 +------------------------------- agent/Cargo.toml | 7 +- agent/src/collectors/systemd.rs | 76 ++--- dashboard/Cargo.toml | 2 +- shared/Cargo.toml | 2 +- 5 files changed, 36 insertions(+), 604 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5a472ec..9de75d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -103,137 +103,6 @@ dependencies = [ "object", ] -[[package]] -name = "async-broadcast" -version = "0.7.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "435a87a52755b8f27fcf321ac4f04b2802e337c8c4872923137471ec39c37532" -dependencies = [ - "event-listener", - "event-listener-strategy", - "futures-core", - "pin-project-lite", -] - -[[package]] -name = "async-channel" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "924ed96dd52d1b75e9c1a3e6275715fd320f5f9439fb5a4a11fa51f4221158d2" -dependencies = [ - "concurrent-queue", - "event-listener-strategy", - "futures-core", - "pin-project-lite", -] - -[[package]] -name = "async-executor" -version = "1.13.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "497c00e0fd83a72a79a39fcbd8e3e2f055d6f6c7e025f3b3d91f4f8e76527fb8" -dependencies = [ - "async-task", - "concurrent-queue", - "fastrand", - "futures-lite", - "pin-project-lite", - "slab", -] - -[[package]] -name = "async-fs" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8034a681df4aed8b8edbd7fbe472401ecf009251c8b40556b304567052e294c5" -dependencies = [ - "async-lock", - "blocking", - "futures-lite", -] - -[[package]] -name = "async-io" -version = "2.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "456b8a8feb6f42d237746d4b3e9a178494627745c3c56c6ea55d92ba50d026fc" -dependencies = [ - "autocfg", - "cfg-if", - "concurrent-queue", - "futures-io", - "futures-lite", - "parking", - "polling", - "rustix", - "slab", - "windows-sys 0.61.2", -] - -[[package]] -name = "async-lock" -version = "3.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fd03604047cee9b6ce9de9f70c6cd540a0520c813cbd49bae61f33ab80ed1dc" -dependencies = [ - "event-listener", - "event-listener-strategy", - "pin-project-lite", -] - -[[package]] -name = "async-process" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fc50921ec0055cdd8a16de48773bfeec5c972598674347252c0399676be7da75" -dependencies = [ - "async-channel", - "async-io", - "async-lock", - "async-signal", - "async-task", - "blocking", - "cfg-if", - "event-listener", - "futures-lite", - "rustix", -] - -[[package]] -name = "async-recursion" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "async-signal" -version = "0.2.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43c070bbf59cd3570b6b2dd54cd772527c7c3620fce8be898406dd3ed6adc64c" -dependencies = [ - "async-io", - "async-lock", - "atomic-waker", - "cfg-if", - "futures-core", - "futures-io", - "rustix", - "signal-hook-registry", - "slab", - "windows-sys 0.61.2", -] - -[[package]] -name = "async-task" -version = "4.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de" - [[package]] name = "async-trait" version = "0.1.89" @@ -281,28 +150,6 @@ version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - -[[package]] -name = "blocking" -version = "1.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e83f8d02be6967315521be875afa792a316e28d57b5a2d401897e2a7921b7f21" -dependencies = [ - "async-channel", - "async-task", - "futures-io", - "futures-lite", - "piper", -] - [[package]] name = "bollard" version = "0.17.1" @@ -493,7 +340,7 @@ checksum = 
"a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.199" +version = "0.1.200" dependencies = [ "anyhow", "chrono", @@ -515,7 +362,7 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.199" +version = "0.1.200" dependencies = [ "anyhow", "async-trait", @@ -539,13 +386,12 @@ dependencies = [ "toml", "tracing", "tracing-subscriber", - "zbus", "zmq", ] [[package]] name = "cm-dashboard-shared" -version = "0.1.199" +version = "0.1.200" dependencies = [ "chrono", "serde", @@ -559,15 +405,6 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" -[[package]] -name = "concurrent-queue" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" -dependencies = [ - "crossbeam-utils", -] - [[package]] name = "core-foundation" version = "0.9.4" @@ -584,15 +421,6 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" -[[package]] -name = "cpufeatures" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" -dependencies = [ - "libc", -] - [[package]] name = "crossbeam" version = "0.8.4" @@ -674,16 +502,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "crypto-common" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" -dependencies = [ - "generic-array", - "typenum", -] - [[package]] name = "deranged" version = "0.5.5" @@ -694,16 +512,6 @@ dependencies = [ "serde_core", ] -[[package]] -name = "digest" -version = "0.10.7" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", -] - [[package]] name = "dircpy" version = "0.3.19" @@ -763,33 +571,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "endi" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66b7e2430c6dff6a955451e2cfc438f09cea1965a9d6f87f7e3b90decc014099" - -[[package]] -name = "enumflags2" -version = "0.7.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1027f7680c853e056ebcec683615fb6fbbc07dbaa13b4d5d9442b146ded4ecef" -dependencies = [ - "enumflags2_derive", - "serde", -] - -[[package]] -name = "enumflags2_derive" -version = "0.7.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67c78a4d8fdf9953a5c9d458f9efe940fd97a0cab0941c075a813ac594733827" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "equivalent" version = "1.0.2" @@ -806,27 +587,6 @@ dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "event-listener" -version = "5.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13b66accf52311f30a0db42147dadea9850cb48cd070028831ae5f5d4b856ab" -dependencies = [ - "concurrent-queue", - "parking", - "pin-project-lite", -] - -[[package]] -name = "event-listener-strategy" -version = "0.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" -dependencies = [ - "event-listener", - "pin-project-lite", -] - [[package]] name = "fastrand" version = "2.3.0" @@ -923,19 +683,6 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" -[[package]] -name = "futures-lite" -version = "2.6.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f78e10609fe0e0b3f4157ffab1876319b5b0db102a2c60dc4626306dc46b44ad" -dependencies = [ - "fastrand", - "futures-core", - "futures-io", - "parking", - "pin-project-lite", -] - [[package]] name = "futures-macro" version = "0.3.31" @@ -977,16 +724,6 @@ dependencies = [ "slab", ] -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - [[package]] name = "gethostname" version = "0.4.3" @@ -997,17 +734,6 @@ dependencies = [ "windows-targets 0.48.5", ] -[[package]] -name = "getrandom" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - [[package]] name = "getrandom" version = "0.3.4" @@ -1084,12 +810,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" - [[package]] name = "hex" version = "0.4.3" @@ -1464,7 +1184,7 @@ version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" dependencies = [ - "getrandom 0.3.4", + "getrandom", "libc", ] @@ -1573,15 +1293,6 @@ version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" -[[package]] -name = "memoffset" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" -dependencies = [ - "autocfg", -] - [[package]] name = "mime" version = "0.3.17" @@ -1713,7 +1424,6 @@ dependencies = [ "cfg-if", "cfg_aliases", "libc", - "memoffset", ] [[package]] @@ -1814,22 +1524,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "ordered-stream" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aa2b01e1d916879f73a53d01d1d6cee68adbb31d6d9177a8cfce093cced1d50" -dependencies = [ - "futures-core", - "pin-project-lite", -] - -[[package]] -name = "parking" -version = "2.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" - [[package]] name = "parking_lot" version = "0.12.5" @@ -1924,37 +1618,12 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" -[[package]] -name = "piper" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" -dependencies = [ - "atomic-waker", - "fastrand", - "futures-io", -] - [[package]] name = "pkg-config" version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" -[[package]] -name = "polling" -version = "3.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" -dependencies = [ - "cfg-if", - "concurrent-queue", - "hermit-abi", - "pin-project-lite", - "rustix", - "windows-sys 0.61.2", -] - [[package]] name = "potential_utf" version = "0.1.4" @@ -1970,24 +1639,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" -[[package]] -name = "ppv-lite86" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" -dependencies = [ - "zerocopy", -] - -[[package]] -name = "proc-macro-crate" -version = "3.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" -dependencies = [ - "toml_edit 0.23.7", -] - [[package]] name = "proc-macro2" version = "1.0.103" @@ -2034,18 +1685,6 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", "rand_core", ] @@ -2054,9 +1693,6 @@ name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom 0.2.16", -] [[package]] name = "ratatui" @@ -2410,17 +2046,6 @@ dependencies = [ "time", ] -[[package]] -name = "sha1" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - [[package]] name = "sharded-slab" version = "0.1.7" @@ -2523,12 +2148,6 @@ dependencies = [ "windows-sys 0.59.0", ] -[[package]] -name = "static_assertions" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" - [[package]] name = "strsim" version = "0.11.1" @@ -2632,7 +2251,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2d31c77bdf42a745371d260a26ca7163f1e0924b64afa0b688e61b5a9fa02f16" dependencies = [ "fastrand", - "getrandom 0.3.4", + "getrandom", "once_cell", "rustix", "windows-sys 0.61.2", @@ -2787,8 +2406,8 @@ checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" dependencies = [ "serde", "serde_spanned", - "toml_datetime 0.6.11", - "toml_edit 0.22.27", + "toml_datetime", + "toml_edit", ] [[package]] @@ -2800,15 +2419,6 @@ dependencies = [ "serde", ] -[[package]] -name = "toml_datetime" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2cdb639ebbc97961c51720f858597f7f24c4fc295327923af55b74c3c724533" -dependencies = [ - "serde_core", -] - [[package]] name = "toml_edit" version = "0.22.27" @@ -2818,32 +2428,11 @@ dependencies = [ "indexmap 2.12.1", "serde", "serde_spanned", - "toml_datetime 0.6.11", + "toml_datetime", "toml_write", "winnow", ] -[[package]] -name = "toml_edit" -version = "0.23.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6485ef6d0d9b5d0ec17244ff7eb05310113c3f316f2d14200d4de56b3cb98f8d" -dependencies = [ - "indexmap 2.12.1", - "toml_datetime 0.7.3", - "toml_parser", - "winnow", -] - -[[package]] -name = "toml_parser" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0cbe268d35bdb4bb5a56a2de88d0ad0eb70af5384a99d648cd4b3d04039800e" -dependencies = [ - "winnow", -] - [[package]] name = "toml_write" version = "0.1.2" @@ -2923,23 +2512,6 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" -[[package]] -name = "typenum" -version = "1.19.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" - -[[package]] -name = "uds_windows" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89daebc3e6fd160ac4aa9fc8b3bf71e1f74fbf92367ae71fb83a037e8bf164b9" -dependencies = [ - "memoffset", - "tempfile", - "winapi", -] - [[package]] name = "unicode-ident" version = "1.0.22" @@ -3466,16 +3038,6 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" -[[package]] -name = "xdg-home" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec1cdab258fb55c0da61328dc52c8764709b249011b2cad0454c72f0bf10a1f6" -dependencies = [ - "libc", - "windows-sys 0.59.0", -] - [[package]] name = "yoke" version = "0.8.1" @@ -3499,68 +3061,6 @@ dependencies = [ "synstructure", ] -[[package]] -name = "zbus" -version = "4.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb97012beadd29e654708a0fdb4c84bc046f537aecfde2c3ee0a9e4b4d48c725" -dependencies = [ - "async-broadcast", - "async-executor", - "async-fs", - "async-io", - "async-lock", - "async-process", - "async-recursion", - "async-task", - "async-trait", - "blocking", - "enumflags2", - "event-listener", - "futures-core", - "futures-sink", - "futures-util", - "hex", - "nix 0.29.0", - "ordered-stream", - "rand", - "serde", - "serde_repr", - "sha1", - "static_assertions", - "tracing", - "uds_windows", - "windows-sys 0.52.0", - "xdg-home", - "zbus_macros", - "zbus_names", - "zvariant", -] - -[[package]] -name = "zbus_macros" -version = "4.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "267db9407081e90bbfa46d841d3cbc60f59c0351838c4bc65199ecd79ab1983e" -dependencies = [ - "proc-macro-crate", - "proc-macro2", - "quote", - "syn", - "zvariant_utils", -] 
- -[[package]] -name = "zbus_names" -version = "3.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b9b1fef7d021261cc16cba64c351d291b715febe0fa10dc3a443ac5a5022e6c" -dependencies = [ - "serde", - "static_assertions", - "zvariant", -] - [[package]] name = "zerocopy" version = "0.8.30" @@ -3666,40 +3166,3 @@ dependencies = [ "system-deps", "zeromq-src", ] - -[[package]] -name = "zvariant" -version = "4.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2084290ab9a1c471c38fc524945837734fbf124487e105daec2bb57fd48c81fe" -dependencies = [ - "endi", - "enumflags2", - "serde", - "static_assertions", - "zvariant_derive", -] - -[[package]] -name = "zvariant_derive" -version = "4.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73e2ba546bda683a90652bac4a279bc146adad1386f25379cf73200d2002c449" -dependencies = [ - "proc-macro-crate", - "proc-macro2", - "quote", - "syn", - "zvariant_utils", -] - -[[package]] -name = "zvariant_utils" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c51bcff7cc3dbb5055396bcf774748c3dab426b4b8659046963523cee4808340" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] diff --git a/agent/Cargo.toml b/agent/Cargo.toml index d99bcb9..965d873 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.199" +version = "0.1.200" edition = "2021" [dependencies] @@ -30,7 +30,4 @@ futures = "0.3" libc = "0.2" # Docker API client -bollard = "0.17" - -# D-Bus client for systemd -zbus = "4.0" \ No newline at end of file +bollard = "0.17" \ No newline at end of file diff --git a/agent/src/collectors/systemd.rs b/agent/src/collectors/systemd.rs index 2cd425d..4194417 100644 --- a/agent/src/collectors/systemd.rs +++ b/agent/src/collectors/systemd.rs @@ -7,7 +7,6 @@ use std::time::Instant; use tracing::{debug, warn}; use bollard::Docker; use 
bollard::container::ListContainersOptions; -use zbus::Connection; use super::{Collector, CollectorError}; use crate::config::SystemdConfig; @@ -376,41 +375,21 @@ impl SystemdCollector { Ok(("inactive".to_string(), "LoadState=not-found\nActiveState=inactive\nSubState=dead".to_string())) } - /// Get a unit property via D-Bus - async fn get_unit_property(&self, service_name: &str, property: &str) -> Option<zbus::zvariant::OwnedValue> { - // Connect to system D-Bus - let connection = Connection::system().await.ok()?; - - // Get systemd manager proxy - let manager_proxy = zbus::Proxy::new( - &connection, - "org.freedesktop.systemd1", - "/org/freedesktop/systemd1", - "org.freedesktop.systemd1.Manager", - ).await.ok()?; - - // Get unit path for service - let unit_name = format!("{}.service", service_name); - let unit_path: zbus::zvariant::OwnedObjectPath = manager_proxy - .call("GetUnit", &(unit_name,)) - .await + /// Get a unit property via systemctl show + fn get_unit_property(&self, service_name: &str, property: &str) -> Option<String> { + let output = Command::new("systemctl") + .args(&["show", &format!("{}.service", service_name), &format!("--property={}", property)]) + .output() .ok()?; - // Get property using standard D-Bus Properties interface - let prop_proxy = zbus::Proxy::new( - &connection, - "org.freedesktop.systemd1", - unit_path.as_str(), - "org.freedesktop.DBus.Properties", - ).await.ok()?; - - // Try Service interface first, fallback to Unit interface - // Get returns a Variant, we need to extract the inner value - if let Ok(variant) = prop_proxy.call("Get", &("org.freedesktop.systemd1.Service", property)).await { - return Some(variant); + if !output.status.success() { + return None; } - prop_proxy.call("Get", &("org.freedesktop.systemd1.Unit", property)).await.ok() + let output_str = String::from_utf8(output.stdout).ok()?; + // Parse "PropertyName=value" format + let value = output_str.trim().strip_prefix(&format!("{}=", property))?; + Some(value.to_string()) } /// Check if service name 
matches pattern (supports wildcards like nginx*) @@ -466,13 +445,10 @@ impl SystemdCollector { return Ok(0.0); } - // No configured path - try to get WorkingDirectory from D-Bus - if let Some(value) = self.get_unit_property(service_name, "WorkingDirectory").await { - // WorkingDirectory is a string property - try to extract as string - if let Ok(dir_str) = <String>::try_from(value) { - if !dir_str.is_empty() && dir_str != "/" && dir_str != "[not set]" { - return Ok(self.get_directory_size(&dir_str).await.unwrap_or(0.0)); - } + // No configured path - try to get WorkingDirectory from systemctl + if let Some(dir_str) = self.get_unit_property(service_name, "WorkingDirectory") { + if !dir_str.is_empty() && dir_str != "/" && dir_str != "[not set]" { + return Ok(self.get_directory_size(&dir_str).await.unwrap_or(0.0)); } } @@ -534,12 +510,12 @@ impl SystemdCollector { } } - /// Get memory usage for a specific service via D-Bus + /// Get memory usage for a specific service via systemctl async fn get_service_memory_usage(&self, service_name: &str) -> Result<f32, CollectorError> { - // Get MemoryCurrent property from D-Bus - if let Some(value) = self.get_unit_property(service_name, "MemoryCurrent").await { - // MemoryCurrent is a u64 property (bytes) - try to extract - if let Ok(memory_bytes) = <u64>::try_from(value) { + // Get MemoryCurrent property from systemctl + if let Some(value_str) = self.get_unit_property(service_name, "MemoryCurrent") { + // MemoryCurrent is in bytes or could be "[not set]" + if let Ok(memory_bytes) = value_str.parse::<u64>() { return Ok(memory_bytes as f32 / (1024.0 * 1024.0)); // Convert to MB } } @@ -629,15 +605,11 @@ impl SystemdCollector { Some(String::from_utf8_lossy(&output.stdout).to_string()) } - /// Get nginx config from systemd service definition via D-Bus (NixOS compatible) + /// Get nginx config from systemd service definition via systemctl (NixOS compatible) async fn get_nginx_config_from_systemd(&self) -> Option<String> { - // Get ExecStart property from D-Bus - let value = 
self.get_unit_property("nginx", "ExecStart").await?; - - // ExecStart is a complex structure: array of (path, args, unclean_exit) - // For our purposes, we need to extract the command line - let exec_start_str = format!("{:?}", value); - debug!("nginx ExecStart from D-Bus: {}", exec_start_str); + // Get ExecStart property from systemctl + let exec_start_str = self.get_unit_property("nginx", "ExecStart")?; + debug!("nginx ExecStart from systemctl: {}", exec_start_str); // Extract config path from ExecStart structure if let Some(config_path) = self.extract_config_path_from_exec_start(&exec_start_str) { diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index 16f49f6..0821774 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.199" +version = "0.1.200" edition = "2021" [dependencies] diff --git a/shared/Cargo.toml b/shared/Cargo.toml index af20240..686dbce 100644 --- a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.199" +version = "0.1.200" edition = "2021" [dependencies]