All checks were successful
Build and Release / build-and-release (push) Successful in 1m24s
- Replace simulated progress messages with actual stdout/stderr capture
- Stream all nixos-rebuild output line-by-line to terminal popup
- Show transparent build process including downloads, compilation, and activation
- Maintain real-time visibility into complete rebuild process
548 lines
22 KiB
Rust
use anyhow::Result;
use gethostname::gethostname;
use std::time::Duration;
use tokio::time::interval;
use tracing::{debug, error, info};

use crate::communication::{AgentCommand, ServiceAction, ZmqHandler};
use crate::config::AgentConfig;
use crate::metrics::MetricCollectionManager;
use crate::notifications::NotificationManager;
use crate::status::HostStatusManager;
use cm_dashboard_shared::{CommandOutputMessage, Metric, MetricMessage, MetricValue, Status};

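/// Per-host monitoring agent: owns the ZMQ transport plus the managers for
/// metric collection, host status aggregation, and notifications.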
pub struct Agent {
    hostname: String,
    config: AgentConfig,
    zmq_handler: ZmqHandler,
    metric_manager: MetricCollectionManager,
    notification_manager: NotificationManager,
    host_status_manager: HostStatusManager,
}

impl Agent {
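    /// Load the required configuration file and initialize all subsystems
    /// (ZMQ transport, metric collectors, notifications, status aggregation).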
    pub async fn new(config_path: Option<String>) -> Result<Self> {
        let hostname = gethostname().to_string_lossy().to_string();
        info!("Initializing agent for host: {}", hostname);

        // Load configuration (now required)
        let config_path = config_path
            .ok_or_else(|| anyhow::anyhow!("Configuration file path is required"))?;
        let config = AgentConfig::from_file(&config_path)?;

        info!("Agent configuration loaded");

        // Initialize ZMQ communication
        let zmq_handler = ZmqHandler::new(&config.zmq).await?;
        info!(
            "ZMQ communication initialized on port {}",
            config.zmq.publisher_port
        );

        // Initialize metric collection manager with cache config
        let metric_manager = MetricCollectionManager::new(&config.collectors, &config).await?;
        info!("Metric collection manager initialized");

        // Initialize notification manager
        let notification_manager = NotificationManager::new(&config.notifications, &hostname)?;
        info!("Notification manager initialized");

        // Initialize host status manager
        let host_status_manager = HostStatusManager::new(config.status_aggregation.clone());
        info!("Host status manager initialized");

        Ok(Self {
            hostname,
            config,
            zmq_handler,
            metric_manager,
            notification_manager,
            host_status_manager,
        })
    }

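    /// Main loop: independent intervals drive metric collection, ZMQ
    /// broadcast, notification batching, and command polling until the
    /// shutdown signal fires.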
    pub async fn run(&mut self, mut shutdown_rx: tokio::sync::oneshot::Receiver<()>) -> Result<()> {
        info!("Starting agent main loop with separated collection and transmission");

        // CRITICAL: Collect ALL data immediately at startup before entering the loop
        info!("Performing initial FORCE collection of all metrics at startup");
        if let Err(e) = self.collect_all_metrics_force().await {
            error!("Failed to collect initial metrics: {}", e);
        } else {
            info!("Initial metric collection completed - all data cached and ready");
        }

        // Separate intervals for collection and transmission
        let mut collection_interval =
            interval(Duration::from_secs(self.config.collection_interval_seconds));
        let mut transmission_interval = interval(Duration::from_secs(1)); // ZMQ broadcast every 1 second
        let mut notification_interval = interval(Duration::from_secs(
            self.config.status_aggregation.notification_interval_seconds,
        ));

        loop {
            tokio::select! {
                _ = collection_interval.tick() => {
                    // Only collect and cache metrics, no ZMQ transmission
                    if let Err(e) = self.collect_metrics_only().await {
                        error!("Failed to collect metrics: {}", e);
                    }
                }
                _ = transmission_interval.tick() => {
                    // Send all metrics via ZMQ every 1 second
                    if let Err(e) = self.broadcast_all_metrics().await {
                        error!("Failed to broadcast metrics: {}", e);
                    }
                }
                _ = notification_interval.tick() => {
                    // Process batched notifications
                    if let Err(e) = self.host_status_manager.process_pending_notifications(&mut self.notification_manager).await {
                        error!("Failed to process pending notifications: {}", e);
                    }
                }
                // Handle incoming commands (check periodically)
                _ = tokio::time::sleep(Duration::from_millis(100)) => {
                    if let Err(e) = self.handle_commands().await {
                        error!("Error handling commands: {}", e);
                    }
                }
                _ = &mut shutdown_rx => {
                    info!("Shutdown signal received, stopping agent loop");
                    break;
                }
            }
        }

        info!("Agent main loop stopped");
        Ok(())
    }

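    /// Startup-only collection that bypasses collector caching so every
    /// metric is populated before the first broadcast.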
    async fn collect_all_metrics_force(&mut self) -> Result<()> {
        info!("Starting FORCE metric collection for startup");

        // Force collect all metrics from all collectors immediately
        let metrics = self.metric_manager.collect_all_metrics_force().await?;

        if metrics.is_empty() {
            error!("No metrics collected during force collection!");
            return Ok(());
        }

        info!("Force collected and cached {} metrics", metrics.len());

        // Process metrics through status manager
        self.process_metrics(&metrics).await;

        Ok(())
    }

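    /// Periodic collection that only refreshes the cache; broadcasting
    /// happens separately on the transmission interval.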
    async fn collect_metrics_only(&mut self) -> Result<()> {
        debug!("Starting metric collection cycle (cache only)");

        // Collect all metrics from all collectors and cache them
        let metrics = self.metric_manager.collect_all_metrics().await?;

        if metrics.is_empty() {
            debug!("No metrics collected this cycle");
            return Ok(());
        }

        debug!("Collected and cached {} metrics", metrics.len());

        // Process metrics through status manager
        self.process_metrics(&metrics).await;

        Ok(())
    }

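    /// Publish the current metric set over ZMQ, appending the aggregated
    /// host status metric and the agent version metric.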
    async fn broadcast_all_metrics(&mut self) -> Result<()> {
        debug!("Broadcasting all metrics via ZMQ");

        // Get all current metrics from collectors
        let mut metrics = self.metric_manager.collect_all_metrics().await?;

        // Add the host status summary metric from status manager
        let host_status_metric = self.host_status_manager.get_host_status_metric();
        metrics.push(host_status_metric);

        // Add agent version metric for cross-host version comparison
        let version_metric = self.get_agent_version_metric();
        metrics.push(version_metric);

        // The host status and version metrics are always appended, so there
        // is always something to broadcast.
        debug!("Broadcasting {} metrics (including host status summary)", metrics.len());

        // Create and send message with all current data
        let message = MetricMessage::new(self.hostname.clone(), metrics);
        self.zmq_handler.publish_metrics(&message).await?;

        debug!("Metrics broadcasted successfully");
        Ok(())
    }

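    /// Run each metric through the host status manager, which updates the
    /// aggregated status and queues notifications as needed.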
    async fn process_metrics(&mut self, metrics: &[Metric]) {
        for metric in metrics {
            self.host_status_manager.process_metric(metric, &mut self.notification_manager).await;
        }
    }

    /// Create agent version metric for cross-host version comparison
    fn get_agent_version_metric(&self) -> Metric {
        // Get version from executable path (same logic as main.rs get_version)
        let version = self.get_agent_version();

        Metric::new(
            "agent_version".to_string(),
            MetricValue::String(version),
            Status::Ok,
        )
    }

    /// Get agent version from executable path
    fn get_agent_version(&self) -> String {
        match std::env::current_exe() {
            Ok(exe_path) => {
                let exe_str = exe_path.to_string_lossy();

                // Extract Nix store hash from path
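                // Hypothetical example: an executable path such as
                // "/nix/store/8a9f0k3q...-cm-dashboard-agent/bin/agent"
                // would yield "8a9f0k3q" (first 8 characters of the store hash).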
                if let Some(hash_part) = exe_str.strip_prefix("/nix/store/") {
                    if let Some(hash) = hash_part.split('-').next() {
                        if hash.len() >= 8 {
                            return hash[..8].to_string();
                        }
                    }
                }

                "unknown".to_string()
            }
            Err(_) => "unknown".to_string(),
        }
    }

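    /// Poll the ZMQ command socket once, without blocking, and dispatch any
    /// command that arrived.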
    async fn handle_commands(&mut self) -> Result<()> {
        // Try to receive commands (non-blocking)
        match self.zmq_handler.try_receive_command() {
            Ok(Some(command)) => {
                info!("Received command: {:?}", command);
                self.process_command(command).await?;
            }
            Ok(None) => {
                // No command available - this is normal
            }
            Err(e) => {
                error!("Error receiving command: {}", e);
            }
        }
        Ok(())
    }

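    /// Dispatch a single received command to its handler.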
    async fn process_command(&mut self, command: AgentCommand) -> Result<()> {
        match command {
            AgentCommand::CollectNow => {
                info!("Processing CollectNow command");
                if let Err(e) = self.collect_metrics_only().await {
                    error!("Failed to collect metrics on command: {}", e);
                }
            }
            AgentCommand::SetInterval { seconds } => {
                info!("Processing SetInterval command: {} seconds", seconds);
                // Note: This would require modifying the interval, which is complex
                // For now, just log the request
                info!("Interval change requested but not implemented yet");
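                // An untested sketch of one way to support this: push `seconds`
                // through a tokio::sync::watch channel (hypothetical, not part of
                // this struct) and have `run()` rebuild its timer when the value
                // changes, e.g.:
                //   collection_interval = interval(Duration::from_secs(seconds));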
            }
            AgentCommand::ToggleCollector { name, enabled } => {
                info!(
                    "Processing ToggleCollector command: {} -> {}",
                    name, enabled
                );
                // Note: This would require dynamic collector management
                info!("Collector toggle requested but not implemented yet");
            }
            AgentCommand::Ping => {
                info!("Processing Ping command - agent is alive");
                // Could send a response back via ZMQ if needed
            }
            AgentCommand::ServiceControl { service_name, action } => {
                info!("Processing ServiceControl command: {} {:?}", service_name, action);
                if let Err(e) = self.handle_service_control(&service_name, &action).await {
                    error!("Failed to execute service control: {}", e);
                }
            }
            AgentCommand::SystemRebuild { git_url, git_branch, working_dir, api_key_file } => {
                info!("Processing SystemRebuild command: {} @ {} -> {}", git_url, git_branch, working_dir);
                if let Err(e) = self.handle_system_rebuild(&git_url, &git_branch, &working_dir, api_key_file.as_deref()).await {
                    error!("Failed to execute system rebuild: {}", e);
                }
            }
        }
        Ok(())
    }

    /// Handle systemd service control commands
    async fn handle_service_control(&self, service_name: &str, action: &ServiceAction) -> Result<()> {
        let action_str = match action {
            ServiceAction::Start => "start",
            ServiceAction::Stop => "stop",
            ServiceAction::Restart => "restart",
            ServiceAction::Status => "status",
        };

        info!("Executing systemctl {} {}", action_str, service_name);

        let output = tokio::process::Command::new("sudo")
            .arg("systemctl")
            .arg(action_str)
            .arg(service_name)
            .output()
            .await?;

        if output.status.success() {
            info!("Service {} {} completed successfully", service_name, action_str);
            if !output.stdout.is_empty() {
                debug!("stdout: {}", String::from_utf8_lossy(&output.stdout));
            }
        } else {
            let stderr = String::from_utf8_lossy(&output.stderr);
            error!("Service {} {} failed: {}", service_name, action_str, stderr);
            return Err(anyhow::anyhow!("systemctl {} {} failed: {}", action_str, service_name, stderr));
        }

        // Force refresh metrics after service control to update service status
        if matches!(action, ServiceAction::Start | ServiceAction::Stop | ServiceAction::Restart) {
            info!("Triggering metric refresh after service control");
            // Note: We can't call self.collect_metrics_only() here due to borrowing issues
            // The next metric collection cycle will pick up the changes
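            // An untested alternative: set a shared force-refresh flag, e.g. an
            // Arc<AtomicBool> field (hypothetical, not in this struct), and have
            // the main loop collect immediately when it observes the flag.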
        }

        Ok(())
    }

    /// Handle NixOS system rebuild commands with real-time output streaming
    async fn handle_system_rebuild(&self, git_url: &str, git_branch: &str, working_dir: &str, api_key_file: Option<&str>) -> Result<()> {
        info!("Starting NixOS system rebuild: {} @ {} -> {}", git_url, git_branch, working_dir);

        let command_id = format!("rebuild_{}", chrono::Utc::now().timestamp());

        // Send initial status
        self.send_command_output(&command_id, "SystemRebuild", "Starting NixOS system rebuild...").await?;

        // Enable maintenance mode before rebuild
        let maintenance_file = "/tmp/cm-maintenance";
        if let Err(e) = tokio::fs::File::create(maintenance_file).await {
            self.send_command_output(&command_id, "SystemRebuild", &format!("Warning: Failed to create maintenance mode file: {}", e)).await?;
        } else {
            self.send_command_output(&command_id, "SystemRebuild", "Maintenance mode enabled").await?;
        }

        // Clone or update repository
        self.send_command_output(&command_id, "SystemRebuild", "Cloning/updating git repository...").await?;
        let git_result = self.ensure_git_repository_with_output(&command_id, git_url, git_branch, working_dir, api_key_file).await;

        if let Err(e) = git_result {
            self.send_command_output(&command_id, "SystemRebuild", &format!("Git operation failed: {}", e)).await?;
            self.send_command_output_complete(&command_id, "SystemRebuild").await?;
            // Clear maintenance mode before bailing out so the host is not left flagged
            let _ = tokio::fs::remove_file(maintenance_file).await;
            return Err(e);
        }

        self.send_command_output(&command_id, "SystemRebuild", "Git repository ready, starting nixos-rebuild...").await?;

        // Execute nixos-rebuild with real-time output streaming
        let rebuild_result = self.execute_nixos_rebuild_with_streaming(&command_id, working_dir).await;

        // Always try to remove maintenance mode file
        if let Err(e) = tokio::fs::remove_file(maintenance_file).await {
            if e.kind() != std::io::ErrorKind::NotFound {
                self.send_command_output(&command_id, "SystemRebuild", &format!("Warning: Failed to remove maintenance mode file: {}", e)).await?;
            }
        } else {
            self.send_command_output(&command_id, "SystemRebuild", "Maintenance mode disabled").await?;
        }

        // Handle rebuild result
        match rebuild_result {
            Ok(()) => {
                self.send_command_output(&command_id, "SystemRebuild", "✓ NixOS rebuild completed successfully!").await?;
            }
            Err(e) => {
                self.send_command_output(&command_id, "SystemRebuild", &format!("✗ NixOS rebuild failed: {}", e)).await?;
            }
        }

        // Signal completion
        self.send_command_output_complete(&command_id, "SystemRebuild").await?;

        info!("System rebuild streaming completed");
        Ok(())
    }

    /// Send command output line to dashboard
    async fn send_command_output(&self, command_id: &str, command_type: &str, output_line: &str) -> Result<()> {
        let message = CommandOutputMessage::new(
            self.hostname.clone(),
            command_id.to_string(),
            command_type.to_string(),
            output_line.to_string(),
            false,
        );
        self.zmq_handler.publish_command_output(&message).await
    }

    /// Send command completion signal to dashboard
    async fn send_command_output_complete(&self, command_id: &str, command_type: &str) -> Result<()> {
        let message = CommandOutputMessage::new(
            self.hostname.clone(),
            command_id.to_string(),
            command_type.to_string(),
            "Command completed".to_string(),
            true,
        );
        self.zmq_handler.publish_command_output(&message).await
    }

    /// Execute nixos-rebuild with real-time output streaming
    async fn execute_nixos_rebuild_with_streaming(&self, command_id: &str, working_dir: &str) -> Result<()> {
        use tokio::io::{AsyncBufReadExt, BufReader};
        use tokio::process::Command;

        let mut child = Command::new("sudo")
            .arg("/run/current-system/sw/bin/nixos-rebuild")
            .arg("switch")
            .arg("--option")
            .arg("sandbox")
            .arg("false")
            .arg("--flake")
            .arg(".")
            .current_dir(working_dir)
            .stdout(std::process::Stdio::piped())
            .stderr(std::process::Stdio::piped())
            .spawn()?;

        // Get stdout and stderr handles
        let stdout = child.stdout.take().expect("Failed to get stdout");
        let stderr = child.stderr.take().expect("Failed to get stderr");

        // Create readers for both streams
        let stdout_reader = BufReader::new(stdout);
        let stderr_reader = BufReader::new(stderr);

        let mut stdout_lines = stdout_reader.lines();
        let mut stderr_lines = stderr_reader.lines();

        // Track stream EOF so closed streams stop being polled; otherwise the
        // select! loop would spin on Ok(None) until the process exits.
        let mut stdout_done = false;
        let mut stderr_done = false;

        // Stream output lines in real-time
        loop {
            tokio::select! {
                // Read from stdout
                line = stdout_lines.next_line(), if !stdout_done => {
                    match line {
                        Ok(Some(line)) => {
                            self.send_command_output(command_id, "SystemRebuild", &line).await?;
                        }
                        Ok(None) => {
                            // stdout closed
                            stdout_done = true;
                        }
                        Err(e) => {
                            self.send_command_output(command_id, "SystemRebuild", &format!("stdout error: {}", e)).await?;
                            stdout_done = true;
                        }
                    }
                }
                // Read from stderr
                line = stderr_lines.next_line(), if !stderr_done => {
                    match line {
                        Ok(Some(line)) => {
                            self.send_command_output(command_id, "SystemRebuild", &line).await?;
                        }
                        Ok(None) => {
                            // stderr closed
                            stderr_done = true;
                        }
                        Err(e) => {
                            self.send_command_output(command_id, "SystemRebuild", &format!("stderr error: {}", e)).await?;
                            stderr_done = true;
                        }
                    }
                }
                // Wait for process completion once both streams have drained,
                // so no buffered output is lost
                result = child.wait(), if stdout_done && stderr_done => {
                    let status = result?;
                    if status.success() {
                        return Ok(());
                    } else {
                        return Err(anyhow::anyhow!("nixos-rebuild exited with status: {}", status));
                    }
                }
            }
        }
    }

    /// Ensure git repository with output streaming
    async fn ensure_git_repository_with_output(&self, _command_id: &str, git_url: &str, git_branch: &str, working_dir: &str, api_key_file: Option<&str>) -> Result<()> {
        // This is a simplified version - we can enhance this later with git output streaming
        self.ensure_git_repository(git_url, git_branch, working_dir, api_key_file).await
    }

    /// Ensure git repository is cloned and up to date with force clone approach
    async fn ensure_git_repository(&self, git_url: &str, git_branch: &str, working_dir: &str, api_key_file: Option<&str>) -> Result<()> {
        use std::path::Path;

        // Read API key if provided
        let auth_url = if let Some(key_file) = api_key_file {
            match tokio::fs::read_to_string(key_file).await {
                Ok(api_key) => {
                    let api_key = api_key.trim();
                    if !api_key.is_empty() {
                        // Convert https://gitea.cmtec.se/cm/nixosbox.git to https://token@gitea.cmtec.se/cm/nixosbox.git
                        if let Some(url_without_protocol) = git_url.strip_prefix("https://") {
                            format!("https://{}@{}", api_key, url_without_protocol)
                        } else {
                            info!("API key provided but URL is not HTTPS, using original URL");
                            git_url.to_string()
                        }
                    } else {
                        info!("API key file is empty, using original URL");
                        git_url.to_string()
                    }
                }
                Err(e) => {
                    info!("Could not read API key file {}: {}, using original URL", key_file, e);
                    git_url.to_string()
                }
            }
        } else {
            git_url.to_string()
        };

        // Always remove existing directory and do fresh clone for consistent state
        let working_path = Path::new(working_dir);
        if working_path.exists() {
            info!("Removing existing repository directory: {}", working_dir);
            if let Err(e) = tokio::fs::remove_dir_all(working_path).await {
                error!("Failed to remove existing directory: {}", e);
                return Err(anyhow::anyhow!("Failed to remove existing directory: {}", e));
            }
        }

        info!("Force cloning git repository from {} (branch: {})", git_url, git_branch);

        // Force clone with depth 1 for efficiency (no history needed for deployment)
        let output = tokio::process::Command::new("git")
            .arg("clone")
            .arg("--depth")
            .arg("1")
            .arg("--branch")
            .arg(git_branch)
            .arg(&auth_url)
            .arg(working_dir)
            .output()
            .await?;

        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            error!("Git clone failed: {}", stderr);
            return Err(anyhow::anyhow!("Git clone failed: {}", stderr));
        }

        info!("Git repository cloned successfully with latest state");
        Ok(())
    }
}