Simplifies host connection configuration by removing the tailscale_ip field, connection_type preferences, and the fallback retry logic; connections now use only the ip field, falling back to the hostname. Also eliminates the blocking TCP connectivity tests that interfered with heartbeat processing. This resolves intermittent host lost/found issues by removing the connection retry timeouts that blocked the ZMQ message processing loop.
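As a minimal sketch of the simplified resolution described above, a dashboard-side host entry might end up looking like this (the type and field names here are illustrative, not the project's actual definitions):

// Hypothetical host entry after the simplification: no tailscale_ip,
// no connection_type preference, no retry configuration.
struct HostEntry {
    hostname: String,
    ip: Option<String>,
}

impl HostEntry {
    /// Resolve the address to connect to: the ip field if set, otherwise the hostname.
    fn connect_address(&self) -> &str {
        self.ip.as_deref().unwrap_or(&self.hostname)
    }
}

With no connection_type preference or retry loop, resolving an address becomes a pure lookup, so it cannot stall the ZMQ message processing loop.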
use anyhow::Result;
use gethostname::gethostname;
use std::time::Duration;
use tokio::time::interval;
use tracing::{debug, error, info};

use crate::communication::{AgentCommand, ServiceAction, ZmqHandler};
use crate::config::AgentConfig;
use crate::metrics::MetricCollectionManager;
use crate::notifications::NotificationManager;
use crate::service_tracker::UserStoppedServiceTracker;
use crate::status::HostStatusManager;
use cm_dashboard_shared::{Metric, MetricMessage, MetricValue, Status};

pub struct Agent {
    hostname: String,
    config: AgentConfig,
    zmq_handler: ZmqHandler,
    metric_manager: MetricCollectionManager,
    notification_manager: NotificationManager,
    host_status_manager: HostStatusManager,
    service_tracker: UserStoppedServiceTracker,
}

impl Agent {
    pub async fn new(config_path: Option<String>) -> Result<Self> {
        let hostname = gethostname().to_string_lossy().to_string();
        info!("Initializing agent for host: {}", hostname);

        // Load configuration (now required)
        let config_path = config_path.ok_or_else(|| anyhow::anyhow!("Configuration file path is required"))?;
        let config = AgentConfig::from_file(&config_path)?;

        info!("Agent configuration loaded");

        // Initialize ZMQ communication
        let zmq_handler = ZmqHandler::new(&config.zmq).await?;
        info!(
            "ZMQ communication initialized on port {}",
            config.zmq.publisher_port
        );

        // Initialize metric collection manager with cache config
        let metric_manager = MetricCollectionManager::new(&config.collectors, &config).await?;
        info!("Metric collection manager initialized");

        // Initialize notification manager
        let notification_manager = NotificationManager::new(&config.notifications, &hostname)?;
        info!("Notification manager initialized");

        // Initialize host status manager
        let host_status_manager = HostStatusManager::new(config.status_aggregation.clone());
        info!("Host status manager initialized");

        // Initialize user-stopped service tracker
        let service_tracker = UserStoppedServiceTracker::init_global()?;
        info!("User-stopped service tracker initialized");

        Ok(Self {
            hostname,
            config,
            zmq_handler,
            metric_manager,
            notification_manager,
            host_status_manager,
            service_tracker,
        })
    }

    pub async fn run(&mut self, mut shutdown_rx: tokio::sync::oneshot::Receiver<()>) -> Result<()> {
        info!("Starting agent main loop with separated collection and transmission");

        // CRITICAL: Collect ALL data immediately at startup before entering the loop
        info!("Performing initial FORCE collection of all metrics at startup");
        if let Err(e) = self.collect_all_metrics_force().await {
            error!("Failed to collect initial metrics: {}", e);
        } else {
            info!("Initial metric collection completed - all data cached and ready");
        }

        // Separate intervals for collection, transmission, heartbeat, and email notifications
        let mut collection_interval =
            interval(Duration::from_secs(self.config.collection_interval_seconds));
        let mut transmission_interval =
            interval(Duration::from_secs(self.config.zmq.transmission_interval_seconds));
        let mut heartbeat_interval =
            interval(Duration::from_secs(self.config.zmq.heartbeat_interval_seconds));
        let mut notification_interval =
            interval(Duration::from_secs(self.config.notifications.aggregation_interval_seconds));

        loop {
            tokio::select! {
                _ = collection_interval.tick() => {
                    // Only collect and cache metrics, no ZMQ transmission
                    if let Err(e) = self.collect_metrics_only().await {
                        error!("Failed to collect metrics: {}", e);
                    }
                }
                _ = transmission_interval.tick() => {
                    // Send all metrics via ZMQ (dashboard updates only)
                    if let Err(e) = self.broadcast_all_metrics().await {
                        error!("Failed to broadcast metrics: {}", e);
                    }
                }
                _ = heartbeat_interval.tick() => {
                    // Send standalone heartbeat for host connectivity detection
                    if let Err(e) = self.send_heartbeat().await {
                        error!("Failed to send heartbeat: {}", e);
                    }
                }
                _ = notification_interval.tick() => {
                    // Process batched email notifications (separate from dashboard updates)
                    if let Err(e) = self.host_status_manager.process_pending_notifications(&mut self.notification_manager).await {
                        error!("Failed to process pending notifications: {}", e);
                    }
                }
                // Handle incoming commands (check periodically)
                _ = tokio::time::sleep(Duration::from_millis(100)) => {
                    if let Err(e) = self.handle_commands().await {
                        error!("Error handling commands: {}", e);
                    }
                }
                _ = &mut shutdown_rx => {
                    info!("Shutdown signal received, stopping agent loop");
                    break;
                }
            }
        }

        info!("Agent main loop stopped");
        Ok(())
    }

    async fn collect_all_metrics_force(&mut self) -> Result<()> {
        info!("Starting FORCE metric collection for startup");

        // Force collect all metrics from all collectors immediately
        let metrics = self.metric_manager.collect_all_metrics_force().await?;

        if metrics.is_empty() {
            error!("No metrics collected during force collection!");
            return Ok(());
        }

        info!("Force collected and cached {} metrics", metrics.len());

        // Process metrics through status manager (collect status data at startup)
        let _status_changed = self.process_metrics(&metrics).await;

        Ok(())
    }

    async fn collect_metrics_only(&mut self) -> Result<()> {
        debug!("Starting metric collection cycle (cache only)");

        // Collect all metrics from all collectors and cache them
        let metrics = self.metric_manager.collect_all_metrics().await?;

        if metrics.is_empty() {
            debug!("No metrics collected this cycle");
            return Ok(());
        }

        debug!("Collected and cached {} metrics", metrics.len());

        // Process metrics through status manager and trigger immediate transmission if status changed
        let status_changed = self.process_metrics(&metrics).await;

        if status_changed {
            info!("Status change detected - triggering immediate metric transmission");
            if let Err(e) = self.broadcast_all_metrics().await {
                error!("Failed to broadcast metrics after status change: {}", e);
            }
        }

        Ok(())
    }

    async fn broadcast_all_metrics(&mut self) -> Result<()> {
        debug!("Broadcasting cached metrics via ZMQ");

        // Get cached metrics (no fresh collection)
        let mut metrics = self.metric_manager.get_cached_metrics();

        // Add the host status summary metric from status manager
        let host_status_metric = self.host_status_manager.get_host_status_metric();
        metrics.push(host_status_metric);

        // Add agent version metric for cross-host version comparison
        let version_metric = self.get_agent_version_metric();
        metrics.push(version_metric);

        // Add heartbeat metric for host connectivity detection
        let heartbeat_metric = self.get_heartbeat_metric();
        metrics.push(heartbeat_metric);

        // Check for user-stopped services that are now active and clear their flags
        self.clear_user_stopped_flags_for_active_services(&metrics);

        if metrics.is_empty() {
            debug!("No metrics to broadcast");
            return Ok(());
        }

        debug!("Broadcasting {} cached metrics (including host status summary)", metrics.len());

        // Create and send message with all current data
        let message = MetricMessage::new(self.hostname.clone(), metrics);
        self.zmq_handler.publish_metrics(&message).await?;

        debug!("Metrics broadcasted successfully");
        Ok(())
    }

    async fn process_metrics(&mut self, metrics: &[Metric]) -> bool {
        let mut status_changed = false;
        for metric in metrics {
            // Filter excluded metrics from email notification processing only
            if self.config.notifications.exclude_email_metrics.contains(&metric.name) {
                debug!("Excluding metric '{}' from email notification processing", metric.name);
                continue;
            }

            if self.host_status_manager.process_metric(metric, &mut self.notification_manager).await {
                status_changed = true;
            }
        }
        status_changed
    }

    /// Create agent version metric for cross-host version comparison
    fn get_agent_version_metric(&self) -> Metric {
        // Get the version string (same logic as main.rs get_version)
        let version = self.get_agent_version();

        Metric::new(
            "agent_version".to_string(),
            MetricValue::String(version),
            Status::Ok,
        )
    }

    /// Get agent version from Cargo package version
    fn get_agent_version(&self) -> String {
        // Use the version from Cargo.toml (e.g., "0.1.11")
        format!("v{}", env!("CARGO_PKG_VERSION"))
    }

    /// Create heartbeat metric for host connectivity detection
    fn get_heartbeat_metric(&self) -> Metric {
        use std::time::{SystemTime, UNIX_EPOCH};

        let timestamp = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_secs();

        Metric::new(
            "agent_heartbeat".to_string(),
            MetricValue::Integer(timestamp as i64),
            Status::Ok,
        )
    }

    /// Send standalone heartbeat for connectivity detection
    async fn send_heartbeat(&mut self) -> Result<()> {
        let heartbeat_metric = self.get_heartbeat_metric();
        let message = MetricMessage::new(self.hostname.clone(), vec![heartbeat_metric]);

        self.zmq_handler.publish_metrics(&message).await?;
        debug!("Sent standalone heartbeat for connectivity detection");
        Ok(())
    }

    async fn handle_commands(&mut self) -> Result<()> {
        // Try to receive commands (non-blocking)
        match self.zmq_handler.try_receive_command() {
            Ok(Some(command)) => {
                info!("Received command: {:?}", command);
                self.process_command(command).await?;
            }
            Ok(None) => {
                // No command available - this is normal
            }
            Err(e) => {
                error!("Error receiving command: {}", e);
            }
        }
        Ok(())
    }

    async fn process_command(&mut self, command: AgentCommand) -> Result<()> {
        match command {
            AgentCommand::CollectNow => {
                info!("Processing CollectNow command");
                if let Err(e) = self.collect_metrics_only().await {
                    error!("Failed to collect metrics on command: {}", e);
                }
            }
            AgentCommand::SetInterval { seconds } => {
                info!("Processing SetInterval command: {} seconds", seconds);
                // Note: This would require modifying the interval, which is complex
                // For now, just log the request
                info!("Interval change requested but not implemented yet");
            }
            AgentCommand::ToggleCollector { name, enabled } => {
                info!(
                    "Processing ToggleCollector command: {} -> {}",
                    name, enabled
                );
                // Note: This would require dynamic collector management
                info!("Collector toggle requested but not implemented yet");
            }
            AgentCommand::Ping => {
                info!("Processing Ping command - agent is alive");
                // Could send a response back via ZMQ if needed
            }
            AgentCommand::ServiceControl { service_name, action } => {
                info!("Processing ServiceControl command: {} {:?}", service_name, action);
                if let Err(e) = self.handle_service_control(&service_name, &action).await {
                    error!("Failed to execute service control: {}", e);
                }
            }
        }
        Ok(())
    }

    /// Handle systemd service control commands
    async fn handle_service_control(&mut self, service_name: &str, action: &ServiceAction) -> Result<()> {
        let (action_str, is_user_action) = match action {
            ServiceAction::Start => ("start", false),
            ServiceAction::Stop => ("stop", false),
            ServiceAction::Status => ("status", false),
            ServiceAction::UserStart => ("start", true),
            ServiceAction::UserStop => ("stop", true),
        };

        info!("Executing systemctl {} {} (user action: {})", action_str, service_name, is_user_action);

        // Handle user-stopped service tracking before systemctl execution (stop only)
        match action {
            ServiceAction::UserStop => {
                info!("Marking service '{}' as user-stopped", service_name);
                if let Err(e) = self.service_tracker.mark_user_stopped(service_name) {
                    error!("Failed to mark service as user-stopped: {}", e);
                } else {
                    // Sync to global tracker
                    UserStoppedServiceTracker::update_global(&self.service_tracker);
                }
            }
            _ => {}
        }

        // Spawn the systemctl command asynchronously to avoid blocking the agent
        let service_name_clone = service_name.to_string();
        let action_str_clone = action_str.to_string();

        tokio::spawn(async move {
            let result = tokio::process::Command::new("sudo")
                .arg("systemctl")
                .arg(&action_str_clone)
                .arg(format!("{}.service", service_name_clone))
                .output()
                .await;

            match result {
                Ok(output) => {
                    if output.status.success() {
                        info!("Service {} {} completed successfully", service_name_clone, action_str_clone);
                        if !output.stdout.is_empty() {
                            debug!("stdout: {}", String::from_utf8_lossy(&output.stdout));
                        }
                    } else {
                        let stderr = String::from_utf8_lossy(&output.stderr);
                        error!("Service {} {} failed: {}", service_name_clone, action_str_clone, stderr);
                    }
                }
                Err(e) => {
                    error!("Failed to execute systemctl {} {}: {}", action_str_clone, service_name_clone, e);
                }
            }
        });

        info!("Service {} {} command initiated (non-blocking)", service_name, action_str);

        // Note: Service status will be updated by the normal metric collection cycle
        // once the systemctl operation completes

        Ok(())
    }

    /// Check metrics for user-stopped services that are now active and clear their flags
    fn clear_user_stopped_flags_for_active_services(&mut self, metrics: &[Metric]) {
        for metric in metrics {
            // Look for service status metrics that are active
            if metric.name.starts_with("service_") && metric.name.ends_with("_status") {
                if let MetricValue::String(status) = &metric.value {
                    if status == "active" {
                        // Extract service name from metric name (service_nginx_status -> nginx)
                        let service_name = metric.name
                            .strip_prefix("service_")
                            .and_then(|s| s.strip_suffix("_status"))
                            .unwrap_or("");

                        if !service_name.is_empty() && UserStoppedServiceTracker::is_service_user_stopped(service_name) {
                            info!("Service '{}' is now active - clearing user-stopped flag", service_name);
                            if let Err(e) = self.service_tracker.clear_user_stopped(service_name) {
                                error!("Failed to clear user-stopped flag for '{}': {}", service_name, e);
                            } else {
                                // Sync to global tracker
                                UserStoppedServiceTracker::update_global(&self.service_tracker);
                                debug!("Cleared user-stopped flag for service '{}'", service_name);
                            }
                        }
                    }
                }
            }
        }
    }
}
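For context, a minimal sketch of how this Agent could be driven from a binary entry point, assuming the config path arrives as the first CLI argument and shutdown is triggered by Ctrl-C (both are assumptions; the project's actual main.rs may wire this differently):

// Illustrative only: wiring Agent::new and Agent::run with a oneshot shutdown channel.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Config path taken from the first CLI argument purely for illustration.
    let config_path = std::env::args().nth(1);

    let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel();
    tokio::spawn(async move {
        // Assumed shutdown trigger; requires tokio's "signal" feature.
        let _ = tokio::signal::ctrl_c().await;
        let _ = shutdown_tx.send(());
    });

    let mut agent = Agent::new(config_path).await?;
    agent.run(shutdown_rx).await
}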