From c56e9d7be29ab3a781252cb3b339a36154133487 Mon Sep 17 00:00:00 2001 From: Christoffer Martinsson Date: Thu, 30 Oct 2025 10:42:56 +0100 Subject: [PATCH] Implement user-stopped service tracking system Add comprehensive tracking for services stopped via dashboard to prevent false alerts when users intentionally stop services. Features: - User-stopped services report Status::Ok instead of Warning - Persistent storage survives agent restarts - Dashboard sends UserStart/UserStop commands - Agent tracks and syncs user-stopped state globally - Systemd collector respects user-stopped flags Implementation: - New service_tracker module with persistent JSON storage - Enhanced ServiceAction enum with UserStart/UserStop variants - Global singleton tracker accessible by collectors - Service status logic updated to check user-stopped flag - Dashboard version now uses CARGO_PKG_VERSION automatically Bump version to v0.1.43 --- Cargo.lock | 6 +- agent/Cargo.toml | 2 +- agent/src/agent.rs | 44 +++++++- agent/src/collectors/systemd.rs | 17 ++- agent/src/communication/mod.rs | 2 + agent/src/main.rs | 1 + agent/src/service_tracker.rs | 172 +++++++++++++++++++++++++++++ dashboard/Cargo.toml | 2 +- dashboard/src/app.rs | 8 +- dashboard/src/communication/mod.rs | 2 + dashboard/src/main.rs | 6 +- shared/Cargo.toml | 2 +- 12 files changed, 239 insertions(+), 25 deletions(-) create mode 100644 agent/src/service_tracker.rs diff --git a/Cargo.lock b/Cargo.lock index c110bb0..e83c35a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,7 +270,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d" [[package]] name = "cm-dashboard" -version = "0.1.40" +version = "0.1.43" dependencies = [ "anyhow", "chrono", @@ -291,7 +291,7 @@ dependencies = [ [[package]] name = "cm-dashboard-agent" -version = "0.1.40" +version = "0.1.43" dependencies = [ "anyhow", "async-trait", @@ -314,7 +314,7 @@ dependencies = [ [[package]] name = "cm-dashboard-shared" -version = "0.1.40" +version = 
"0.1.43" dependencies = [ "chrono", "serde", diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 2a00d9a..ea19298 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-agent" -version = "0.1.42" +version = "0.1.43" edition = "2021" [dependencies] diff --git a/agent/src/agent.rs b/agent/src/agent.rs index d2dd1a6..d711366 100644 --- a/agent/src/agent.rs +++ b/agent/src/agent.rs @@ -8,6 +8,7 @@ use crate::communication::{AgentCommand, ServiceAction, ZmqHandler}; use crate::config::AgentConfig; use crate::metrics::MetricCollectionManager; use crate::notifications::NotificationManager; +use crate::service_tracker::UserStoppedServiceTracker; use crate::status::HostStatusManager; use cm_dashboard_shared::{Metric, MetricMessage, MetricValue, Status}; @@ -18,6 +19,7 @@ pub struct Agent { metric_manager: MetricCollectionManager, notification_manager: NotificationManager, host_status_manager: HostStatusManager, + service_tracker: UserStoppedServiceTracker, } impl Agent { @@ -50,6 +52,10 @@ impl Agent { let host_status_manager = HostStatusManager::new(config.status_aggregation.clone()); info!("Host status manager initialized"); + // Initialize user-stopped service tracker + let service_tracker = UserStoppedServiceTracker::init_global()?; + info!("User-stopped service tracker initialized"); + Ok(Self { hostname, config, @@ -57,6 +63,7 @@ impl Agent { metric_manager, notification_manager, host_status_manager, + service_tracker, }) } @@ -271,13 +278,38 @@ impl Agent { /// Handle systemd service control commands async fn handle_service_control(&mut self, service_name: &str, action: &ServiceAction) -> Result<()> { - let action_str = match action { - ServiceAction::Start => "start", - ServiceAction::Stop => "stop", - ServiceAction::Status => "status", + let (action_str, is_user_action) = match action { + ServiceAction::Start => ("start", false), + ServiceAction::Stop => ("stop", false), + ServiceAction::Status => ("status", false), + 
ServiceAction::UserStart => ("start", true), + ServiceAction::UserStop => ("stop", true), }; - info!("Executing systemctl {} {}", action_str, service_name); + info!("Executing systemctl {} {} (user action: {})", action_str, service_name, is_user_action); + + // Handle user-stopped service tracking before systemctl execution + match action { + ServiceAction::UserStop => { + info!("Marking service '{}' as user-stopped", service_name); + if let Err(e) = self.service_tracker.mark_user_stopped(service_name) { + error!("Failed to mark service as user-stopped: {}", e); + } else { + // Sync to global tracker + UserStoppedServiceTracker::update_global(&self.service_tracker); + } + } + ServiceAction::UserStart => { + info!("Clearing user-stopped flag for service '{}'", service_name); + if let Err(e) = self.service_tracker.clear_user_stopped(service_name) { + error!("Failed to clear user-stopped flag: {}", e); + } else { + // Sync to global tracker + UserStoppedServiceTracker::update_global(&self.service_tracker); + } + } + _ => {} + } let output = tokio::process::Command::new("sudo") .arg("systemctl") @@ -298,7 +330,7 @@ impl Agent { } // Force refresh metrics after service control to update service status - if matches!(action, ServiceAction::Start | ServiceAction::Stop) { + if matches!(action, ServiceAction::Start | ServiceAction::Stop | ServiceAction::UserStart | ServiceAction::UserStop) { info!("Triggering immediate metric refresh after service control"); if let Err(e) = self.collect_metrics_only().await { error!("Failed to refresh metrics after service control: {}", e); diff --git a/agent/src/collectors/systemd.rs b/agent/src/collectors/systemd.rs index d379adf..f8966ef 100644 --- a/agent/src/collectors/systemd.rs +++ b/agent/src/collectors/systemd.rs @@ -8,6 +8,7 @@ use tracing::debug; use super::{Collector, CollectorError}; use crate::config::SystemdConfig; +use crate::service_tracker::UserStoppedServiceTracker; /// Systemd collector for monitoring systemd services pub 
struct SystemdCollector { @@ -353,11 +354,19 @@ impl SystemdCollector { Ok((active_status, detailed_info)) } - /// Calculate service status - fn calculate_service_status(&self, active_status: &str) -> Status { + /// Calculate service status, taking user-stopped services into account + fn calculate_service_status(&self, service_name: &str, active_status: &str) -> Status { match active_status.to_lowercase().as_str() { "active" => Status::Ok, - "inactive" | "dead" => Status::Warning, + "inactive" | "dead" => { + // Check if this service was stopped by user action + if UserStoppedServiceTracker::is_service_user_stopped(service_name) { + debug!("Service '{}' is inactive but marked as user-stopped - treating as OK", service_name); + Status::Ok + } else { + Status::Warning + } + }, "failed" | "error" => Status::Critical, "activating" | "deactivating" | "reloading" | "start" | "stop" | "restart" => Status::Pending, _ => Status::Unknown, @@ -480,7 +489,7 @@ impl Collector for SystemdCollector { for service in &monitored_services { match self.get_service_status(service) { Ok((active_status, _detailed_info)) => { - let status = self.calculate_service_status(&active_status); + let status = self.calculate_service_status(service, &active_status); // Individual service status metric metrics.push(Metric { diff --git a/agent/src/communication/mod.rs b/agent/src/communication/mod.rs index 0cbe1a0..2321183 100644 --- a/agent/src/communication/mod.rs +++ b/agent/src/communication/mod.rs @@ -113,4 +113,6 @@ pub enum ServiceAction { Start, Stop, Status, + UserStart, // User-initiated start (clears user-stopped flag) + UserStop, // User-initiated stop (marks as user-stopped) } diff --git a/agent/src/main.rs b/agent/src/main.rs index 7fc3f7a..75dd919 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -9,6 +9,7 @@ mod communication; mod config; mod metrics; mod notifications; +mod service_tracker; mod status; use agent::Agent; diff --git a/agent/src/service_tracker.rs 
b/agent/src/service_tracker.rs new file mode 100644 index 0000000..3f7d992 --- /dev/null +++ b/agent/src/service_tracker.rs @@ -0,0 +1,172 @@ +use anyhow::Result; +use serde::{Deserialize, Serialize}; +use std::collections::HashSet; +use std::fs; +use std::path::Path; +use std::sync::{Arc, Mutex, OnceLock}; +use tracing::{debug, info, warn}; + +/// Shared instance for global access +static GLOBAL_TRACKER: OnceLock<Arc<Mutex<UserStoppedServiceTracker>>> = OnceLock::new(); + +/// Tracks services that have been stopped by user action +/// These services should be treated as OK status instead of Warning +#[derive(Debug)] +pub struct UserStoppedServiceTracker { + /// Set of services stopped by user action + user_stopped_services: HashSet<String>, + /// Path to persistent storage file + storage_path: String, +} + +/// Serializable data structure for persistence +#[derive(Debug, Serialize, Deserialize)] +struct UserStoppedData { + services: Vec<String>, +} + +impl UserStoppedServiceTracker { + /// Create new tracker with default storage path + pub fn new() -> Self { + Self::with_storage_path("/var/lib/cm-dashboard/user-stopped-services.json") + } + + /// Initialize global instance (called by agent) + pub fn init_global() -> Result<Self> { + let tracker = Self::new(); + + // Set global instance + let global_instance = Arc::new(Mutex::new(tracker)); + if GLOBAL_TRACKER.set(global_instance).is_err() { + warn!("Global service tracker was already initialized"); + } + + // Return a new instance for the agent to use + Ok(Self::new()) + } + + /// Check if a service is user-stopped (global access for collectors) + pub fn is_service_user_stopped(service_name: &str) -> bool { + if let Some(global) = GLOBAL_TRACKER.get() { + if let Ok(tracker) = global.lock() { + tracker.is_user_stopped(service_name) + } else { + debug!("Failed to lock global service tracker"); + false + } + } else { + debug!("Global service tracker not initialized"); + false + } + } + + /// Update global tracker (called by agent when tracker state changes) + pub fn
update_global(updated_tracker: &UserStoppedServiceTracker) { + if let Some(global) = GLOBAL_TRACKER.get() { + if let Ok(mut tracker) = global.lock() { + tracker.user_stopped_services = updated_tracker.user_stopped_services.clone(); + } else { + debug!("Failed to lock global service tracker for update"); + } + } else { + debug!("Global service tracker not initialized for update"); + } + } + + /// Create new tracker with custom storage path + pub fn with_storage_path<P: AsRef<Path>>(storage_path: P) -> Self { + let storage_path = storage_path.as_ref().to_string_lossy().to_string(); + let mut tracker = Self { + user_stopped_services: HashSet::new(), + storage_path, + }; + + // Load existing data from storage + if let Err(e) = tracker.load_from_storage() { + warn!("Failed to load user-stopped services from storage: {}", e); + info!("Starting with empty user-stopped services list"); + } + + tracker + } + + /// Mark a service as user-stopped + pub fn mark_user_stopped(&mut self, service_name: &str) -> Result<()> { + info!("Marking service '{}' as user-stopped", service_name); + self.user_stopped_services.insert(service_name.to_string()); + self.save_to_storage()?; + debug!("Service '{}' marked as user-stopped and saved to storage", service_name); + Ok(()) + } + + /// Clear user-stopped flag for a service (when user starts it) + pub fn clear_user_stopped(&mut self, service_name: &str) -> Result<()> { + if self.user_stopped_services.remove(service_name) { + info!("Cleared user-stopped flag for service '{}'", service_name); + self.save_to_storage()?; + debug!("Service '{}' user-stopped flag cleared and saved to storage", service_name); + } else { + debug!("Service '{}' was not marked as user-stopped", service_name); + } + Ok(()) + } + + /// Check if a service is marked as user-stopped + pub fn is_user_stopped(&self, service_name: &str) -> bool { + let is_stopped = self.user_stopped_services.contains(service_name); + debug!("Service '{}' user-stopped status: {}", service_name, is_stopped);
+ is_stopped + } + + + /// Save current state to persistent storage + fn save_to_storage(&self) -> Result<()> { + // Create parent directory if it doesn't exist + if let Some(parent_dir) = Path::new(&self.storage_path).parent() { + if !parent_dir.exists() { + fs::create_dir_all(parent_dir)?; + debug!("Created parent directory: {}", parent_dir.display()); + } + } + + let data = UserStoppedData { + services: self.user_stopped_services.iter().cloned().collect(), + }; + + let json_data = serde_json::to_string_pretty(&data)?; + fs::write(&self.storage_path, json_data)?; + + debug!( + "Saved {} user-stopped services to {}", + data.services.len(), + self.storage_path + ); + Ok(()) + } + + /// Load state from persistent storage + fn load_from_storage(&mut self) -> Result<()> { + if !Path::new(&self.storage_path).exists() { + debug!("Storage file {} does not exist, starting fresh", self.storage_path); + return Ok(()); + } + + let json_data = fs::read_to_string(&self.storage_path)?; + let data: UserStoppedData = serde_json::from_str(&json_data)?; + + self.user_stopped_services = data.services.into_iter().collect(); + + info!( + "Loaded {} user-stopped services from {}", + self.user_stopped_services.len(), + self.storage_path + ); + + if !self.user_stopped_services.is_empty() { + debug!("User-stopped services: {:?}", self.user_stopped_services); + } + + Ok(()) + } +} + diff --git a/dashboard/Cargo.toml b/dashboard/Cargo.toml index 3c8fdf0..af74761 100644 --- a/dashboard/Cargo.toml +++ b/dashboard/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard" -version = "0.1.42" +version = "0.1.43" edition = "2021" [dependencies] diff --git a/dashboard/src/app.rs b/dashboard/src/app.rs index d39aea1..a1feb31 100644 --- a/dashboard/src/app.rs +++ b/dashboard/src/app.rs @@ -295,18 +295,18 @@ impl Dashboard { async fn execute_ui_command(&self, command: UiCommand) -> Result<()> { match command { UiCommand::ServiceStart { hostname, service_name } => { - info!("Sending start command for 
service {} on {}", service_name, hostname); + info!("Sending user start command for service {} on {}", service_name, hostname); let agent_command = AgentCommand::ServiceControl { service_name: service_name.clone(), - action: ServiceAction::Start, + action: ServiceAction::UserStart, }; self.zmq_command_sender.send_command(&hostname, agent_command).await?; } UiCommand::ServiceStop { hostname, service_name } => { - info!("Sending stop command for service {} on {}", service_name, hostname); + info!("Sending user stop command for service {} on {}", service_name, hostname); let agent_command = AgentCommand::ServiceControl { service_name: service_name.clone(), - action: ServiceAction::Stop, + action: ServiceAction::UserStop, }; self.zmq_command_sender.send_command(&hostname, agent_command).await?; } diff --git a/dashboard/src/communication/mod.rs b/dashboard/src/communication/mod.rs index da93bdb..2d9688d 100644 --- a/dashboard/src/communication/mod.rs +++ b/dashboard/src/communication/mod.rs @@ -36,6 +36,8 @@ pub enum ServiceAction { Start, Stop, Status, + UserStart, // User-initiated start (clears user-stopped flag) + UserStop, // User-initiated stop (marks as user-stopped) } /// ZMQ consumer for receiving metrics from agents diff --git a/dashboard/src/main.rs b/dashboard/src/main.rs index 7093675..0429706 100644 --- a/dashboard/src/main.rs +++ b/dashboard/src/main.rs @@ -12,10 +12,6 @@ mod ui; use app::Dashboard; -/// Get hardcoded version -fn get_version() -> &'static str { - "v0.1.42" -} /// Check if running inside tmux session fn check_tmux_session() { @@ -42,7 +38,7 @@ fn check_tmux_session() { #[derive(Parser)] #[command(name = "cm-dashboard")] #[command(about = "CM Dashboard TUI with individual metric consumption")] -#[command(version = get_version())] +#[command(version)] struct Cli { /// Increase logging verbosity (-v, -vv) #[arg(short, long, action = clap::ArgAction::Count)] diff --git a/shared/Cargo.toml b/shared/Cargo.toml index 48bdccd..b948e18 100644 --- 
a/shared/Cargo.toml +++ b/shared/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cm-dashboard-shared" -version = "0.1.42" +version = "0.1.43" edition = "2021" [dependencies]