Implement user-stopped service tracking system
All checks were successful
Build and Release / build-and-release (push) Successful in 2m34s
All checks were successful
Build and Release / build-and-release (push) Successful in 2m34s
Add comprehensive tracking for services stopped via the dashboard, to prevent false alerts when users intentionally stop services.

Features:
- User-stopped services report Status::Ok instead of Warning
- Persistent storage survives agent restarts
- Dashboard sends UserStart/UserStop commands
- Agent tracks and syncs user-stopped state globally
- Systemd collector respects user-stopped flags

Implementation:
- New service_tracker module with persistent JSON storage
- Enhanced ServiceAction enum with UserStart/UserStop variants
- Global singleton tracker accessible by collectors
- Service status logic updated to check user-stopped flag
- Dashboard version now uses CARGO_PKG_VERSION automatically

Bump version to v0.1.43.
This commit is contained in:
parent
c8f800a1e5
commit
c56e9d7be2
6
Cargo.lock
generated
6
Cargo.lock
generated
@ -270,7 +270,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.40"
|
||||
version = "0.1.43"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
@ -291,7 +291,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard-agent"
|
||||
version = "0.1.40"
|
||||
version = "0.1.43"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@ -314,7 +314,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard-shared"
|
||||
version = "0.1.40"
|
||||
version = "0.1.43"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"serde",
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard-agent"
|
||||
version = "0.1.42"
|
||||
version = "0.1.43"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@ -8,6 +8,7 @@ use crate::communication::{AgentCommand, ServiceAction, ZmqHandler};
|
||||
use crate::config::AgentConfig;
|
||||
use crate::metrics::MetricCollectionManager;
|
||||
use crate::notifications::NotificationManager;
|
||||
use crate::service_tracker::UserStoppedServiceTracker;
|
||||
use crate::status::HostStatusManager;
|
||||
use cm_dashboard_shared::{Metric, MetricMessage, MetricValue, Status};
|
||||
|
||||
@ -18,6 +19,7 @@ pub struct Agent {
|
||||
metric_manager: MetricCollectionManager,
|
||||
notification_manager: NotificationManager,
|
||||
host_status_manager: HostStatusManager,
|
||||
service_tracker: UserStoppedServiceTracker,
|
||||
}
|
||||
|
||||
impl Agent {
|
||||
@ -50,6 +52,10 @@ impl Agent {
|
||||
let host_status_manager = HostStatusManager::new(config.status_aggregation.clone());
|
||||
info!("Host status manager initialized");
|
||||
|
||||
// Initialize user-stopped service tracker
|
||||
let service_tracker = UserStoppedServiceTracker::init_global()?;
|
||||
info!("User-stopped service tracker initialized");
|
||||
|
||||
Ok(Self {
|
||||
hostname,
|
||||
config,
|
||||
@ -57,6 +63,7 @@ impl Agent {
|
||||
metric_manager,
|
||||
notification_manager,
|
||||
host_status_manager,
|
||||
service_tracker,
|
||||
})
|
||||
}
|
||||
|
||||
@ -271,13 +278,38 @@ impl Agent {
|
||||
|
||||
/// Handle systemd service control commands
|
||||
async fn handle_service_control(&mut self, service_name: &str, action: &ServiceAction) -> Result<()> {
|
||||
let action_str = match action {
|
||||
ServiceAction::Start => "start",
|
||||
ServiceAction::Stop => "stop",
|
||||
ServiceAction::Status => "status",
|
||||
let (action_str, is_user_action) = match action {
|
||||
ServiceAction::Start => ("start", false),
|
||||
ServiceAction::Stop => ("stop", false),
|
||||
ServiceAction::Status => ("status", false),
|
||||
ServiceAction::UserStart => ("start", true),
|
||||
ServiceAction::UserStop => ("stop", true),
|
||||
};
|
||||
|
||||
info!("Executing systemctl {} {}", action_str, service_name);
|
||||
info!("Executing systemctl {} {} (user action: {})", action_str, service_name, is_user_action);
|
||||
|
||||
// Handle user-stopped service tracking before systemctl execution
|
||||
match action {
|
||||
ServiceAction::UserStop => {
|
||||
info!("Marking service '{}' as user-stopped", service_name);
|
||||
if let Err(e) = self.service_tracker.mark_user_stopped(service_name) {
|
||||
error!("Failed to mark service as user-stopped: {}", e);
|
||||
} else {
|
||||
// Sync to global tracker
|
||||
UserStoppedServiceTracker::update_global(&self.service_tracker);
|
||||
}
|
||||
}
|
||||
ServiceAction::UserStart => {
|
||||
info!("Clearing user-stopped flag for service '{}'", service_name);
|
||||
if let Err(e) = self.service_tracker.clear_user_stopped(service_name) {
|
||||
error!("Failed to clear user-stopped flag: {}", e);
|
||||
} else {
|
||||
// Sync to global tracker
|
||||
UserStoppedServiceTracker::update_global(&self.service_tracker);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let output = tokio::process::Command::new("sudo")
|
||||
.arg("systemctl")
|
||||
@ -298,7 +330,7 @@ impl Agent {
|
||||
}
|
||||
|
||||
// Force refresh metrics after service control to update service status
|
||||
if matches!(action, ServiceAction::Start | ServiceAction::Stop) {
|
||||
if matches!(action, ServiceAction::Start | ServiceAction::Stop | ServiceAction::UserStart | ServiceAction::UserStop) {
|
||||
info!("Triggering immediate metric refresh after service control");
|
||||
if let Err(e) = self.collect_metrics_only().await {
|
||||
error!("Failed to refresh metrics after service control: {}", e);
|
||||
|
||||
@ -8,6 +8,7 @@ use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError};
|
||||
use crate::config::SystemdConfig;
|
||||
use crate::service_tracker::UserStoppedServiceTracker;
|
||||
|
||||
/// Systemd collector for monitoring systemd services
|
||||
pub struct SystemdCollector {
|
||||
@ -353,11 +354,19 @@ impl SystemdCollector {
|
||||
Ok((active_status, detailed_info))
|
||||
}
|
||||
|
||||
/// Calculate service status
|
||||
fn calculate_service_status(&self, active_status: &str) -> Status {
|
||||
/// Calculate service status, taking user-stopped services into account
|
||||
fn calculate_service_status(&self, service_name: &str, active_status: &str) -> Status {
|
||||
match active_status.to_lowercase().as_str() {
|
||||
"active" => Status::Ok,
|
||||
"inactive" | "dead" => Status::Warning,
|
||||
"inactive" | "dead" => {
|
||||
// Check if this service was stopped by user action
|
||||
if UserStoppedServiceTracker::is_service_user_stopped(service_name) {
|
||||
debug!("Service '{}' is inactive but marked as user-stopped - treating as OK", service_name);
|
||||
Status::Ok
|
||||
} else {
|
||||
Status::Warning
|
||||
}
|
||||
},
|
||||
"failed" | "error" => Status::Critical,
|
||||
"activating" | "deactivating" | "reloading" | "start" | "stop" | "restart" => Status::Pending,
|
||||
_ => Status::Unknown,
|
||||
@ -480,7 +489,7 @@ impl Collector for SystemdCollector {
|
||||
for service in &monitored_services {
|
||||
match self.get_service_status(service) {
|
||||
Ok((active_status, _detailed_info)) => {
|
||||
let status = self.calculate_service_status(&active_status);
|
||||
let status = self.calculate_service_status(service, &active_status);
|
||||
|
||||
// Individual service status metric
|
||||
metrics.push(Metric {
|
||||
|
||||
@ -113,4 +113,6 @@ pub enum ServiceAction {
|
||||
Start,
|
||||
Stop,
|
||||
Status,
|
||||
UserStart, // User-initiated start (clears user-stopped flag)
|
||||
UserStop, // User-initiated stop (marks as user-stopped)
|
||||
}
|
||||
|
||||
@ -9,6 +9,7 @@ mod communication;
|
||||
mod config;
|
||||
mod metrics;
|
||||
mod notifications;
|
||||
mod service_tracker;
|
||||
mod status;
|
||||
|
||||
use agent::Agent;
|
||||
|
||||
172
agent/src/service_tracker.rs
Normal file
172
agent/src/service_tracker.rs
Normal file
@ -0,0 +1,172 @@
|
||||
use anyhow::Result;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::sync::{Arc, Mutex, OnceLock};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
/// Shared instance for global access
|
||||
static GLOBAL_TRACKER: OnceLock<Arc<Mutex<UserStoppedServiceTracker>>> = OnceLock::new();
|
||||
|
||||
/// Registry of services that were stopped deliberately through a user action.
///
/// A service listed here should be reported with an OK status rather than a
/// Warning while it is inactive.
#[derive(Debug)]
pub struct UserStoppedServiceTracker {
    /// Names of the services currently flagged as user-stopped.
    user_stopped_services: HashSet<String>,
    /// Location of the file the set is persisted to.
    storage_path: String,
}
|
||||
|
||||
/// Serializable data structure for persistence
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct UserStoppedData {
|
||||
services: Vec<String>,
|
||||
}
|
||||
|
||||
impl UserStoppedServiceTracker {
|
||||
/// Create new tracker with default storage path
|
||||
pub fn new() -> Self {
|
||||
Self::with_storage_path("/var/lib/cm-dashboard/user-stopped-services.json")
|
||||
}
|
||||
|
||||
/// Initialize global instance (called by agent)
|
||||
pub fn init_global() -> Result<Self> {
|
||||
let tracker = Self::new();
|
||||
|
||||
// Set global instance
|
||||
let global_instance = Arc::new(Mutex::new(tracker));
|
||||
if GLOBAL_TRACKER.set(global_instance).is_err() {
|
||||
warn!("Global service tracker was already initialized");
|
||||
}
|
||||
|
||||
// Return a new instance for the agent to use
|
||||
Ok(Self::new())
|
||||
}
|
||||
|
||||
/// Check if a service is user-stopped (global access for collectors)
|
||||
pub fn is_service_user_stopped(service_name: &str) -> bool {
|
||||
if let Some(global) = GLOBAL_TRACKER.get() {
|
||||
if let Ok(tracker) = global.lock() {
|
||||
tracker.is_user_stopped(service_name)
|
||||
} else {
|
||||
debug!("Failed to lock global service tracker");
|
||||
false
|
||||
}
|
||||
} else {
|
||||
debug!("Global service tracker not initialized");
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Update global tracker (called by agent when tracker state changes)
|
||||
pub fn update_global(updated_tracker: &UserStoppedServiceTracker) {
|
||||
if let Some(global) = GLOBAL_TRACKER.get() {
|
||||
if let Ok(mut tracker) = global.lock() {
|
||||
tracker.user_stopped_services = updated_tracker.user_stopped_services.clone();
|
||||
} else {
|
||||
debug!("Failed to lock global service tracker for update");
|
||||
}
|
||||
} else {
|
||||
debug!("Global service tracker not initialized for update");
|
||||
}
|
||||
}
|
||||
|
||||
/// Create new tracker with custom storage path
|
||||
pub fn with_storage_path<P: AsRef<Path>>(storage_path: P) -> Self {
|
||||
let storage_path = storage_path.as_ref().to_string_lossy().to_string();
|
||||
let mut tracker = Self {
|
||||
user_stopped_services: HashSet::new(),
|
||||
storage_path,
|
||||
};
|
||||
|
||||
// Load existing data from storage
|
||||
if let Err(e) = tracker.load_from_storage() {
|
||||
warn!("Failed to load user-stopped services from storage: {}", e);
|
||||
info!("Starting with empty user-stopped services list");
|
||||
}
|
||||
|
||||
tracker
|
||||
}
|
||||
|
||||
/// Mark a service as user-stopped
|
||||
pub fn mark_user_stopped(&mut self, service_name: &str) -> Result<()> {
|
||||
info!("Marking service '{}' as user-stopped", service_name);
|
||||
self.user_stopped_services.insert(service_name.to_string());
|
||||
self.save_to_storage()?;
|
||||
debug!("Service '{}' marked as user-stopped and saved to storage", service_name);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Clear user-stopped flag for a service (when user starts it)
|
||||
pub fn clear_user_stopped(&mut self, service_name: &str) -> Result<()> {
|
||||
if self.user_stopped_services.remove(service_name) {
|
||||
info!("Cleared user-stopped flag for service '{}'", service_name);
|
||||
self.save_to_storage()?;
|
||||
debug!("Service '{}' user-stopped flag cleared and saved to storage", service_name);
|
||||
} else {
|
||||
debug!("Service '{}' was not marked as user-stopped", service_name);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check if a service is marked as user-stopped
|
||||
pub fn is_user_stopped(&self, service_name: &str) -> bool {
|
||||
let is_stopped = self.user_stopped_services.contains(service_name);
|
||||
debug!("Service '{}' user-stopped status: {}", service_name, is_stopped);
|
||||
is_stopped
|
||||
}
|
||||
|
||||
|
||||
/// Save current state to persistent storage
|
||||
fn save_to_storage(&self) -> Result<()> {
|
||||
// Create parent directory if it doesn't exist
|
||||
if let Some(parent_dir) = Path::new(&self.storage_path).parent() {
|
||||
if !parent_dir.exists() {
|
||||
fs::create_dir_all(parent_dir)?;
|
||||
debug!("Created parent directory: {}", parent_dir.display());
|
||||
}
|
||||
}
|
||||
|
||||
let data = UserStoppedData {
|
||||
services: self.user_stopped_services.iter().cloned().collect(),
|
||||
};
|
||||
|
||||
let json_data = serde_json::to_string_pretty(&data)?;
|
||||
fs::write(&self.storage_path, json_data)?;
|
||||
|
||||
debug!(
|
||||
"Saved {} user-stopped services to {}",
|
||||
data.services.len(),
|
||||
self.storage_path
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Load state from persistent storage
|
||||
fn load_from_storage(&mut self) -> Result<()> {
|
||||
if !Path::new(&self.storage_path).exists() {
|
||||
debug!("Storage file {} does not exist, starting fresh", self.storage_path);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let json_data = fs::read_to_string(&self.storage_path)?;
|
||||
let data: UserStoppedData = serde_json::from_str(&json_data)?;
|
||||
|
||||
self.user_stopped_services = data.services.into_iter().collect();
|
||||
|
||||
info!(
|
||||
"Loaded {} user-stopped services from {}",
|
||||
self.user_stopped_services.len(),
|
||||
self.storage_path
|
||||
);
|
||||
|
||||
if !self.user_stopped_services.is_empty() {
|
||||
debug!("User-stopped services: {:?}", self.user_stopped_services);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.42"
|
||||
version = "0.1.43"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@ -295,18 +295,18 @@ impl Dashboard {
|
||||
async fn execute_ui_command(&self, command: UiCommand) -> Result<()> {
|
||||
match command {
|
||||
UiCommand::ServiceStart { hostname, service_name } => {
|
||||
info!("Sending start command for service {} on {}", service_name, hostname);
|
||||
info!("Sending user start command for service {} on {}", service_name, hostname);
|
||||
let agent_command = AgentCommand::ServiceControl {
|
||||
service_name: service_name.clone(),
|
||||
action: ServiceAction::Start,
|
||||
action: ServiceAction::UserStart,
|
||||
};
|
||||
self.zmq_command_sender.send_command(&hostname, agent_command).await?;
|
||||
}
|
||||
UiCommand::ServiceStop { hostname, service_name } => {
|
||||
info!("Sending stop command for service {} on {}", service_name, hostname);
|
||||
info!("Sending user stop command for service {} on {}", service_name, hostname);
|
||||
let agent_command = AgentCommand::ServiceControl {
|
||||
service_name: service_name.clone(),
|
||||
action: ServiceAction::Stop,
|
||||
action: ServiceAction::UserStop,
|
||||
};
|
||||
self.zmq_command_sender.send_command(&hostname, agent_command).await?;
|
||||
}
|
||||
|
||||
@ -36,6 +36,8 @@ pub enum ServiceAction {
|
||||
Start,
|
||||
Stop,
|
||||
Status,
|
||||
UserStart, // User-initiated start (clears user-stopped flag)
|
||||
UserStop, // User-initiated stop (marks as user-stopped)
|
||||
}
|
||||
|
||||
/// ZMQ consumer for receiving metrics from agents
|
||||
|
||||
@ -12,10 +12,6 @@ mod ui;
|
||||
|
||||
use app::Dashboard;
|
||||
|
||||
/// Get hardcoded version
|
||||
fn get_version() -> &'static str {
|
||||
"v0.1.42"
|
||||
}
|
||||
|
||||
/// Check if running inside tmux session
|
||||
fn check_tmux_session() {
|
||||
@ -42,7 +38,7 @@ fn check_tmux_session() {
|
||||
#[derive(Parser)]
|
||||
#[command(name = "cm-dashboard")]
|
||||
#[command(about = "CM Dashboard TUI with individual metric consumption")]
|
||||
#[command(version = get_version())]
|
||||
#[command(version)]
|
||||
struct Cli {
|
||||
/// Increase logging verbosity (-v, -vv)
|
||||
#[arg(short, long, action = clap::ArgAction::Count)]
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard-shared"
|
||||
version = "0.1.42"
|
||||
version = "0.1.43"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user