Implement user-stopped service tracking system
All checks were successful
Build and Release / build-and-release (push) Successful in 2m34s

Add comprehensive tracking for services stopped via dashboard to prevent
false alerts when users intentionally stop services.

Features:
- User-stopped services report Status::Ok instead of Warning
- Persistent storage survives agent restarts
- Dashboard sends UserStart/UserStop commands
- Agent tracks user-stopped state and syncs it to a global tracker that collectors read
- Systemd collector respects user-stopped flags

Implementation:
- New service_tracker module with persistent JSON storage
- Enhanced ServiceAction enum with UserStart/UserStop variants
- Global singleton tracker accessible by collectors
- Service status logic updated to check user-stopped flag
- Dashboard version now uses CARGO_PKG_VERSION automatically

Bump version to v0.1.43
This commit is contained in:
Christoffer Martinsson 2025-10-30 10:42:56 +01:00
parent c8f800a1e5
commit c56e9d7be2
12 changed files with 239 additions and 25 deletions

6
Cargo.lock generated
View File

@ -270,7 +270,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
[[package]] [[package]]
name = "cm-dashboard" name = "cm-dashboard"
version = "0.1.40" version = "0.1.43"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"chrono", "chrono",
@ -291,7 +291,7 @@ dependencies = [
[[package]] [[package]]
name = "cm-dashboard-agent" name = "cm-dashboard-agent"
version = "0.1.40" version = "0.1.43"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-trait", "async-trait",
@ -314,7 +314,7 @@ dependencies = [
[[package]] [[package]]
name = "cm-dashboard-shared" name = "cm-dashboard-shared"
version = "0.1.40" version = "0.1.43"
dependencies = [ dependencies = [
"chrono", "chrono",
"serde", "serde",

View File

@ -1,6 +1,6 @@
[package] [package]
name = "cm-dashboard-agent" name = "cm-dashboard-agent"
version = "0.1.42" version = "0.1.43"
edition = "2021" edition = "2021"
[dependencies] [dependencies]

View File

@ -8,6 +8,7 @@ use crate::communication::{AgentCommand, ServiceAction, ZmqHandler};
use crate::config::AgentConfig; use crate::config::AgentConfig;
use crate::metrics::MetricCollectionManager; use crate::metrics::MetricCollectionManager;
use crate::notifications::NotificationManager; use crate::notifications::NotificationManager;
use crate::service_tracker::UserStoppedServiceTracker;
use crate::status::HostStatusManager; use crate::status::HostStatusManager;
use cm_dashboard_shared::{Metric, MetricMessage, MetricValue, Status}; use cm_dashboard_shared::{Metric, MetricMessage, MetricValue, Status};
@ -18,6 +19,7 @@ pub struct Agent {
metric_manager: MetricCollectionManager, metric_manager: MetricCollectionManager,
notification_manager: NotificationManager, notification_manager: NotificationManager,
host_status_manager: HostStatusManager, host_status_manager: HostStatusManager,
service_tracker: UserStoppedServiceTracker,
} }
impl Agent { impl Agent {
@ -50,6 +52,10 @@ impl Agent {
let host_status_manager = HostStatusManager::new(config.status_aggregation.clone()); let host_status_manager = HostStatusManager::new(config.status_aggregation.clone());
info!("Host status manager initialized"); info!("Host status manager initialized");
// Initialize user-stopped service tracker
let service_tracker = UserStoppedServiceTracker::init_global()?;
info!("User-stopped service tracker initialized");
Ok(Self { Ok(Self {
hostname, hostname,
config, config,
@ -57,6 +63,7 @@ impl Agent {
metric_manager, metric_manager,
notification_manager, notification_manager,
host_status_manager, host_status_manager,
service_tracker,
}) })
} }
@ -271,13 +278,38 @@ impl Agent {
/// Handle systemd service control commands /// Handle systemd service control commands
async fn handle_service_control(&mut self, service_name: &str, action: &ServiceAction) -> Result<()> { async fn handle_service_control(&mut self, service_name: &str, action: &ServiceAction) -> Result<()> {
let action_str = match action { let (action_str, is_user_action) = match action {
ServiceAction::Start => "start", ServiceAction::Start => ("start", false),
ServiceAction::Stop => "stop", ServiceAction::Stop => ("stop", false),
ServiceAction::Status => "status", ServiceAction::Status => ("status", false),
ServiceAction::UserStart => ("start", true),
ServiceAction::UserStop => ("stop", true),
}; };
info!("Executing systemctl {} {}", action_str, service_name); info!("Executing systemctl {} {} (user action: {})", action_str, service_name, is_user_action);
// Handle user-stopped service tracking before systemctl execution
match action {
ServiceAction::UserStop => {
info!("Marking service '{}' as user-stopped", service_name);
if let Err(e) = self.service_tracker.mark_user_stopped(service_name) {
error!("Failed to mark service as user-stopped: {}", e);
} else {
// Sync to global tracker
UserStoppedServiceTracker::update_global(&self.service_tracker);
}
}
ServiceAction::UserStart => {
info!("Clearing user-stopped flag for service '{}'", service_name);
if let Err(e) = self.service_tracker.clear_user_stopped(service_name) {
error!("Failed to clear user-stopped flag: {}", e);
} else {
// Sync to global tracker
UserStoppedServiceTracker::update_global(&self.service_tracker);
}
}
_ => {}
}
let output = tokio::process::Command::new("sudo") let output = tokio::process::Command::new("sudo")
.arg("systemctl") .arg("systemctl")
@ -298,7 +330,7 @@ impl Agent {
} }
// Force refresh metrics after service control to update service status // Force refresh metrics after service control to update service status
if matches!(action, ServiceAction::Start | ServiceAction::Stop) { if matches!(action, ServiceAction::Start | ServiceAction::Stop | ServiceAction::UserStart | ServiceAction::UserStop) {
info!("Triggering immediate metric refresh after service control"); info!("Triggering immediate metric refresh after service control");
if let Err(e) = self.collect_metrics_only().await { if let Err(e) = self.collect_metrics_only().await {
error!("Failed to refresh metrics after service control: {}", e); error!("Failed to refresh metrics after service control: {}", e);

View File

@ -8,6 +8,7 @@ use tracing::debug;
use super::{Collector, CollectorError}; use super::{Collector, CollectorError};
use crate::config::SystemdConfig; use crate::config::SystemdConfig;
use crate::service_tracker::UserStoppedServiceTracker;
/// Systemd collector for monitoring systemd services /// Systemd collector for monitoring systemd services
pub struct SystemdCollector { pub struct SystemdCollector {
@ -353,11 +354,19 @@ impl SystemdCollector {
Ok((active_status, detailed_info)) Ok((active_status, detailed_info))
} }
/// Calculate service status /// Calculate service status, taking user-stopped services into account
fn calculate_service_status(&self, active_status: &str) -> Status { fn calculate_service_status(&self, service_name: &str, active_status: &str) -> Status {
match active_status.to_lowercase().as_str() { match active_status.to_lowercase().as_str() {
"active" => Status::Ok, "active" => Status::Ok,
"inactive" | "dead" => Status::Warning, "inactive" | "dead" => {
// Check if this service was stopped by user action
if UserStoppedServiceTracker::is_service_user_stopped(service_name) {
debug!("Service '{}' is inactive but marked as user-stopped - treating as OK", service_name);
Status::Ok
} else {
Status::Warning
}
},
"failed" | "error" => Status::Critical, "failed" | "error" => Status::Critical,
"activating" | "deactivating" | "reloading" | "start" | "stop" | "restart" => Status::Pending, "activating" | "deactivating" | "reloading" | "start" | "stop" | "restart" => Status::Pending,
_ => Status::Unknown, _ => Status::Unknown,
@ -480,7 +489,7 @@ impl Collector for SystemdCollector {
for service in &monitored_services { for service in &monitored_services {
match self.get_service_status(service) { match self.get_service_status(service) {
Ok((active_status, _detailed_info)) => { Ok((active_status, _detailed_info)) => {
let status = self.calculate_service_status(&active_status); let status = self.calculate_service_status(service, &active_status);
// Individual service status metric // Individual service status metric
metrics.push(Metric { metrics.push(Metric {

View File

@ -113,4 +113,6 @@ pub enum ServiceAction {
Start, Start,
Stop, Stop,
Status, Status,
UserStart, // User-initiated start (clears user-stopped flag)
UserStop, // User-initiated stop (marks as user-stopped)
} }

View File

@ -9,6 +9,7 @@ mod communication;
mod config; mod config;
mod metrics; mod metrics;
mod notifications; mod notifications;
mod service_tracker;
mod status; mod status;
use agent::Agent; use agent::Agent;

View File

@ -0,0 +1,172 @@
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashSet;
use std::fs;
use std::path::Path;
use std::sync::{Arc, Mutex, OnceLock};
use tracing::{debug, info, warn};
/// Shared instance for global access.
///
/// Set once by `init_global`; read through `is_service_user_stopped` and
/// overwritten in place by `update_global`. The tracker sits behind an
/// `Arc<Mutex<..>>`, so every access goes through the mutex.
static GLOBAL_TRACKER: OnceLock<Arc<Mutex<UserStoppedServiceTracker>>> = OnceLock::new();
/// Tracks services that have been stopped by user action.
///
/// These services should be treated as OK status instead of Warning.
/// The set is loaded from `storage_path` when the tracker is constructed and
/// written back whenever `mark_user_stopped` or `clear_user_stopped` changes it.
#[derive(Debug)]
pub struct UserStoppedServiceTracker {
/// Names of the services stopped by user action
user_stopped_services: HashSet<String>,
/// Path to the persistent storage file (kept as a string)
storage_path: String,
}
/// Serializable data structure for persistence.
///
/// This is the shape written to and read from the storage file as JSON:
/// a single `services` list holding the user-stopped service names.
#[derive(Debug, Serialize, Deserialize)]
struct UserStoppedData {
/// Names of the user-stopped services
services: Vec<String>,
}
impl UserStoppedServiceTracker {
/// Build a tracker backed by the default storage file.
///
/// Delegates to `with_storage_path`, which also loads any previously
/// persisted state from that file.
pub fn new() -> Self {
    const DEFAULT_STORAGE_PATH: &str = "/var/lib/cm-dashboard/user-stopped-services.json";
    Self::with_storage_path(DEFAULT_STORAGE_PATH)
}
/// Initialize global instance (called by agent)
pub fn init_global() -> Result<Self> {
let tracker = Self::new();
// Set global instance
let global_instance = Arc::new(Mutex::new(tracker));
if GLOBAL_TRACKER.set(global_instance).is_err() {
warn!("Global service tracker was already initialized");
}
// Return a new instance for the agent to use
Ok(Self::new())
}
/// Query the global tracker for a service's user-stopped flag (entry point
/// for collectors).
///
/// Returns `false` when the global tracker has not been initialized or when
/// its mutex cannot be locked; both cases are logged at debug level.
pub fn is_service_user_stopped(service_name: &str) -> bool {
    let Some(shared) = GLOBAL_TRACKER.get() else {
        debug!("Global service tracker not initialized");
        return false;
    };

    match shared.lock() {
        Ok(guard) => guard.is_user_stopped(service_name),
        Err(_) => {
            debug!("Failed to lock global service tracker");
            false
        }
    }
}
/// Copy the given tracker's service set into the global tracker (called by
/// the agent after its own tracker changes).
///
/// Only the set of service names is replaced; the global tracker's storage
/// path is left as it was. Does nothing except log at debug level when the
/// global tracker is not initialized or its mutex cannot be locked.
pub fn update_global(updated_tracker: &UserStoppedServiceTracker) {
    let Some(shared) = GLOBAL_TRACKER.get() else {
        debug!("Global service tracker not initialized for update");
        return;
    };

    match shared.lock() {
        Ok(mut guard) => {
            guard.user_stopped_services = updated_tracker.user_stopped_services.clone();
        }
        Err(_) => debug!("Failed to lock global service tracker for update"),
    }
}
/// Build a tracker that persists to `storage_path`.
///
/// The path is stored as a string (converted lossily if it is not valid
/// UTF-8). Existing state is loaded from the file right away; if loading
/// fails, the failure is logged and the tracker starts with an empty set.
pub fn with_storage_path<P: AsRef<Path>>(storage_path: P) -> Self {
    let mut instance = Self {
        user_stopped_services: HashSet::new(),
        storage_path: storage_path.as_ref().to_string_lossy().into_owned(),
    };

    // A load failure is not fatal: fall back to an empty set.
    match instance.load_from_storage() {
        Ok(()) => {}
        Err(e) => {
            warn!("Failed to load user-stopped services from storage: {}", e);
            info!("Starting with empty user-stopped services list");
        }
    }

    instance
}
/// Mark a service as user-stopped and persist the updated set.
///
/// # Errors
/// Returns the error from `save_to_storage` when the state cannot be
/// written. In that case an entry added by this call is removed again, so a
/// failed call leaves the in-memory set unchanged.
pub fn mark_user_stopped(&mut self, service_name: &str) -> Result<()> {
    info!("Marking service '{}' as user-stopped", service_name);

    let newly_added = self.user_stopped_services.insert(service_name.to_string());

    if let Err(e) = self.save_to_storage() {
        // Roll back only what this call added; an entry that was already
        // present before the call stays in place.
        if newly_added {
            self.user_stopped_services.remove(service_name);
        }
        return Err(e);
    }

    debug!("Service '{}' marked as user-stopped and saved to storage", service_name);
    Ok(())
}
/// Clear the user-stopped flag for a service (when the user starts it) and
/// persist the updated set.
///
/// A service that was not marked is a no-op: nothing is written and `Ok(())`
/// is returned.
///
/// # Errors
/// Returns the error from `save_to_storage` when the state cannot be
/// written. In that case the flag is restored, so a failed call leaves the
/// in-memory set unchanged.
pub fn clear_user_stopped(&mut self, service_name: &str) -> Result<()> {
    if !self.user_stopped_services.remove(service_name) {
        debug!("Service '{}' was not marked as user-stopped", service_name);
        return Ok(());
    }

    if let Err(e) = self.save_to_storage() {
        // Persisting failed: put the flag back so memory and the reported
        // outcome of this call agree.
        self.user_stopped_services.insert(service_name.to_string());
        return Err(e);
    }

    info!("Cleared user-stopped flag for service '{}'", service_name);
    debug!("Service '{}' user-stopped flag cleared and saved to storage", service_name);
    Ok(())
}
/// Report whether `service_name` is in this tracker's user-stopped set.
///
/// The result is also logged at debug level.
pub fn is_user_stopped(&self, service_name: &str) -> bool {
    let flagged = self.user_stopped_services.contains(service_name);
    debug!("Service '{}' user-stopped status: {}", service_name, flagged);
    flagged
}
/// Save current state to persistent storage.
///
/// The service names are written in sorted order so the file content is
/// stable across saves. The JSON is first written to a sibling
/// `<storage_path>.tmp` file and then renamed over the real file, so an
/// interrupted write does not leave a truncated storage file behind (rename
/// replaces the destination in one step on POSIX filesystems).
///
/// # Errors
/// Returns an error if the parent directory cannot be created, the data
/// cannot be serialized, or the temporary file cannot be written or renamed.
fn save_to_storage(&self) -> Result<()> {
    let target = Path::new(&self.storage_path);

    // Create parent directory if it doesn't exist
    if let Some(parent_dir) = target.parent() {
        if !parent_dir.exists() {
            fs::create_dir_all(parent_dir)?;
            debug!("Created parent directory: {}", parent_dir.display());
        }
    }

    // HashSet iteration order is arbitrary; sort for a deterministic file.
    let mut services: Vec<String> = self.user_stopped_services.iter().cloned().collect();
    services.sort_unstable();
    let data = UserStoppedData { services };

    let json_data = serde_json::to_string_pretty(&data)?;

    // Write next to the target so the rename stays on the same filesystem.
    let temp_path = format!("{}.tmp", self.storage_path);
    fs::write(&temp_path, json_data)?;
    if let Err(e) = fs::rename(&temp_path, target) {
        // Best-effort cleanup; the rename error is the one worth reporting.
        let _ = fs::remove_file(&temp_path);
        return Err(e.into());
    }

    debug!(
        "Saved {} user-stopped services to {}",
        data.services.len(),
        self.storage_path
    );

    Ok(())
}
/// Load state from persistent storage
fn load_from_storage(&mut self) -> Result<()> {
if !Path::new(&self.storage_path).exists() {
debug!("Storage file {} does not exist, starting fresh", self.storage_path);
return Ok(());
}
let json_data = fs::read_to_string(&self.storage_path)?;
let data: UserStoppedData = serde_json::from_str(&json_data)?;
self.user_stopped_services = data.services.into_iter().collect();
info!(
"Loaded {} user-stopped services from {}",
self.user_stopped_services.len(),
self.storage_path
);
if !self.user_stopped_services.is_empty() {
debug!("User-stopped services: {:?}", self.user_stopped_services);
}
Ok(())
}
}

View File

@ -1,6 +1,6 @@
[package] [package]
name = "cm-dashboard" name = "cm-dashboard"
version = "0.1.42" version = "0.1.43"
edition = "2021" edition = "2021"
[dependencies] [dependencies]

View File

@ -295,18 +295,18 @@ impl Dashboard {
async fn execute_ui_command(&self, command: UiCommand) -> Result<()> { async fn execute_ui_command(&self, command: UiCommand) -> Result<()> {
match command { match command {
UiCommand::ServiceStart { hostname, service_name } => { UiCommand::ServiceStart { hostname, service_name } => {
info!("Sending start command for service {} on {}", service_name, hostname); info!("Sending user start command for service {} on {}", service_name, hostname);
let agent_command = AgentCommand::ServiceControl { let agent_command = AgentCommand::ServiceControl {
service_name: service_name.clone(), service_name: service_name.clone(),
action: ServiceAction::Start, action: ServiceAction::UserStart,
}; };
self.zmq_command_sender.send_command(&hostname, agent_command).await?; self.zmq_command_sender.send_command(&hostname, agent_command).await?;
} }
UiCommand::ServiceStop { hostname, service_name } => { UiCommand::ServiceStop { hostname, service_name } => {
info!("Sending stop command for service {} on {}", service_name, hostname); info!("Sending user stop command for service {} on {}", service_name, hostname);
let agent_command = AgentCommand::ServiceControl { let agent_command = AgentCommand::ServiceControl {
service_name: service_name.clone(), service_name: service_name.clone(),
action: ServiceAction::Stop, action: ServiceAction::UserStop,
}; };
self.zmq_command_sender.send_command(&hostname, agent_command).await?; self.zmq_command_sender.send_command(&hostname, agent_command).await?;
} }

View File

@ -36,6 +36,8 @@ pub enum ServiceAction {
Start, Start,
Stop, Stop,
Status, Status,
UserStart, // User-initiated start (clears user-stopped flag)
UserStop, // User-initiated stop (marks as user-stopped)
} }
/// ZMQ consumer for receiving metrics from agents /// ZMQ consumer for receiving metrics from agents

View File

@ -12,10 +12,6 @@ mod ui;
use app::Dashboard; use app::Dashboard;
/// Get hardcoded version
fn get_version() -> &'static str {
"v0.1.42"
}
/// Check if running inside tmux session /// Check if running inside tmux session
fn check_tmux_session() { fn check_tmux_session() {
@ -42,7 +38,7 @@ fn check_tmux_session() {
#[derive(Parser)] #[derive(Parser)]
#[command(name = "cm-dashboard")] #[command(name = "cm-dashboard")]
#[command(about = "CM Dashboard TUI with individual metric consumption")] #[command(about = "CM Dashboard TUI with individual metric consumption")]
#[command(version = get_version())] #[command(version)]
struct Cli { struct Cli {
/// Increase logging verbosity (-v, -vv) /// Increase logging verbosity (-v, -vv)
#[arg(short, long, action = clap::ArgAction::Count)] #[arg(short, long, action = clap::ArgAction::Count)]

View File

@ -1,6 +1,6 @@
[package] [package]
name = "cm-dashboard-shared" name = "cm-dashboard-shared"
version = "0.1.42" version = "0.1.43"
edition = "2021" edition = "2021"
[dependencies] [dependencies]