Replace complex SystemRebuild with simple SSH + tmux popup approach
All checks were successful
Build and Release / build-and-release (push) Successful in 2m6s

- Remove all SystemRebuild command infrastructure from agent and dashboard
- Replace with direct tmux popup execution: ssh {user}@{host} {alias} (sketched below, after the benefits list)
- Add configurable SSH user and rebuild alias in dashboard config
- Eliminate agent process crashes during rebuilds
- Simplify architecture by removing ZMQ command streaming complexity
- Clean up all related dead code and fix compilation warnings

Benefits:
- Process isolation: rebuild runs independently via SSH
- Crash resilience: agent/dashboard can restart without affecting rebuilds
- Configuration flexibility: SSH user and alias configurable per deployment
- Operational simplicity: standard tmux popup interface
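
The dashboard-side popup code is not part of the hunks below, so what follows is only a rough sketch of the approach, assuming tmux's display-popup -E as the popup mechanism; the names RebuildSettings, ssh_user, rebuild_alias and open_rebuild_popup are illustrative, not the actual dashboard code:

use std::io;
use std::process::Command;

/// Hypothetical per-deployment settings; the real dashboard config fields
/// may be named differently.
struct RebuildSettings {
    ssh_user: String,      // user for the SSH connection
    rebuild_alias: String, // alias/command run on the remote host
}

/// Open a tmux popup running `ssh {user}@{host} {alias}`.
fn open_rebuild_popup(host: &str, settings: &RebuildSettings) -> io::Result<()> {
    let ssh_cmd = format!("ssh {}@{} {}", settings.ssh_user, host, settings.rebuild_alias);
    // -E closes the popup automatically when the command exits.
    let status = Command::new("tmux")
        .arg("display-popup")
        .arg("-E")
        .arg(&ssh_cmd)
        .status()?;
    if !status.success() {
        eprintln!("tmux popup exited with {}", status);
    }
    Ok(())
}

Because the popup command is executed by the tmux server rather than by the dashboard process, restarting the dashboard does not terminate an in-flight rebuild, which is what the crash-resilience point above relies on.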
2025-10-27 14:25:45 +01:00
parent ac5d2d4db5
commit e61a845965
9 changed files with 73 additions and 425 deletions

View File

@@ -9,7 +9,7 @@ use crate::config::AgentConfig;
use crate::metrics::MetricCollectionManager;
use crate::notifications::NotificationManager;
use crate::status::HostStatusManager;
use cm_dashboard_shared::{CommandOutputMessage, Metric, MetricMessage, MetricValue, Status};
use cm_dashboard_shared::{Metric, MetricMessage, MetricValue, Status};
pub struct Agent {
hostname: String,
@@ -254,12 +254,6 @@ impl Agent {
error!("Failed to execute service control: {}", e);
}
}
AgentCommand::SystemRebuild { git_url, git_branch, working_dir, api_key_file } => {
info!("Processing SystemRebuild command: {} @ {} -> {}", git_url, git_branch, working_dir);
if let Err(e) = self.handle_system_rebuild(&git_url, &git_branch, &working_dir, api_key_file.as_deref()).await {
error!("Failed to execute system rebuild: {}", e);
}
}
}
Ok(())
}
@@ -303,272 +297,4 @@ impl Agent {
Ok(())
}
/// Handle NixOS system rebuild commands with real-time output streaming
async fn handle_system_rebuild(&self, git_url: &str, git_branch: &str, working_dir: &str, api_key_file: Option<&str>) -> Result<()> {
info!("Starting NixOS system rebuild: {} @ {} -> {}", git_url, git_branch, working_dir);
let command_id = format!("rebuild_{}", chrono::Utc::now().timestamp());
// Send initial status
self.send_command_output(&command_id, "SystemRebuild", "Starting NixOS system rebuild...").await?;
// Enable maintenance mode before rebuild
let maintenance_file = "/tmp/cm-maintenance";
if let Err(e) = tokio::fs::File::create(maintenance_file).await {
self.send_command_output(&command_id, "SystemRebuild", &format!("Warning: Failed to create maintenance mode file: {}", e)).await?;
} else {
self.send_command_output(&command_id, "SystemRebuild", "Maintenance mode enabled").await?;
}
// Clone or update repository
self.send_command_output(&command_id, "SystemRebuild", "Cloning/updating git repository...").await?;
let git_result = self.ensure_git_repository_with_output(&command_id, git_url, git_branch, working_dir, api_key_file).await;
if git_result.is_err() {
self.send_command_output(&command_id, "SystemRebuild", &format!("Git operation failed: {:?}", git_result)).await?;
self.send_command_output_complete(&command_id, "SystemRebuild").await?;
return git_result;
}
self.send_command_output(&command_id, "SystemRebuild", "Git repository ready, starting nixos-rebuild...").await?;
// Execute nixos-rebuild with real-time output streaming
let rebuild_result = self.execute_nixos_rebuild_with_streaming(&command_id, working_dir).await;
// Always try to remove maintenance mode file
if let Err(e) = tokio::fs::remove_file(maintenance_file).await {
if e.kind() != std::io::ErrorKind::NotFound {
self.send_command_output(&command_id, "SystemRebuild", &format!("Warning: Failed to remove maintenance mode file: {}", e)).await?;
}
} else {
self.send_command_output(&command_id, "SystemRebuild", "Maintenance mode disabled").await?;
}
// Handle rebuild result
match rebuild_result {
Ok(()) => {
self.send_command_output(&command_id, "SystemRebuild", "✓ NixOS rebuild completed successfully!").await?;
}
Err(e) => {
self.send_command_output(&command_id, "SystemRebuild", &format!("✗ NixOS rebuild failed: {}", e)).await?;
}
}
// Signal completion
self.send_command_output_complete(&command_id, "SystemRebuild").await?;
info!("System rebuild streaming completed");
Ok(())
}
/// Send command output line to dashboard
async fn send_command_output(&self, command_id: &str, command_type: &str, output_line: &str) -> Result<()> {
let message = CommandOutputMessage::new(
self.hostname.clone(),
command_id.to_string(),
command_type.to_string(),
output_line.to_string(),
false,
);
self.zmq_handler.publish_command_output(&message).await
}
/// Send command completion signal to dashboard
async fn send_command_output_complete(&self, command_id: &str, command_type: &str) -> Result<()> {
let message = CommandOutputMessage::new(
self.hostname.clone(),
command_id.to_string(),
command_type.to_string(),
"Command completed".to_string(),
true,
);
self.zmq_handler.publish_command_output(&message).await
}
/// Execute nixos-rebuild via systemd service with journal streaming
async fn execute_nixos_rebuild_with_streaming(&self, command_id: &str, _working_dir: &str) -> Result<()> {
use tokio::io::{AsyncBufReadExt, BufReader};
use tokio::process::Command;
self.send_command_output(command_id, "SystemRebuild", "Starting nixos-rebuild via systemd service...").await?;
// Start the cm-rebuild systemd service
let start_result = Command::new("sudo")
.arg("systemctl")
.arg("start")
.arg("cm-rebuild")
.output()
.await?;
if !start_result.status.success() {
let error = String::from_utf8_lossy(&start_result.stderr);
return Err(anyhow::anyhow!("Failed to start cm-rebuild service: {}", error));
}
self.send_command_output(command_id, "SystemRebuild", "✓ Service started, streaming output...").await?;
// Stream journal output in real-time
let mut journal_child = Command::new("sudo")
.arg("journalctl")
.arg("-u")
.arg("cm-rebuild")
.arg("-f")
.arg("--no-pager")
.arg("--since")
.arg("now")
.stdout(std::process::Stdio::piped())
.stderr(std::process::Stdio::piped())
.spawn()?;
let stdout = journal_child.stdout.take().expect("Failed to get journalctl stdout");
let mut reader = BufReader::new(stdout);
let mut lines = reader.lines();
// Stream journal output and monitor service status
let mut service_completed = false;
let mut status_check_interval = tokio::time::interval(tokio::time::Duration::from_secs(2));
loop {
tokio::select! {
// Read journal output
line = lines.next_line() => {
match line {
Ok(Some(line)) => {
// Clean up journal format (remove timestamp/service prefix if needed)
let clean_line = self.clean_journal_line(&line);
self.send_command_output(command_id, "SystemRebuild", &clean_line).await?;
}
Ok(None) => {
// journalctl stream ended
break;
}
Err(_) => {
// Error reading journal
break;
}
}
}
// Periodically check service status
_ = status_check_interval.tick() => {
if let Ok(status_result) = Command::new("sudo")
.arg("systemctl")
.arg("is-active")
.arg("cm-rebuild")
.output()
.await
{
let status = String::from_utf8_lossy(&status_result.stdout).trim().to_string();
if status == "inactive" {
service_completed = true;
break;
}
}
}
}
}
// Kill journalctl process
let _ = journal_child.kill().await;
// Check final service result
let result = Command::new("sudo")
.arg("systemctl")
.arg("is-failed")
.arg("cm-rebuild")
.output()
.await?;
let output_string = String::from_utf8_lossy(&result.stdout);
let is_failed = output_string.trim();
if is_failed == "failed" {
return Err(anyhow::anyhow!("cm-rebuild service failed"));
}
Ok(())
}
/// Clean journal line to remove systemd metadata
fn clean_journal_line(&self, line: &str) -> String {
// Remove timestamp and service name prefix from journal entries
// Example: "Oct 26 10:30:15 cmbox cm-rebuild[1234]: actual output"
// Becomes: "actual output"
if let Some(colon_pos) = line.rfind(": ") {
line[colon_pos + 2..].to_string()
} else {
line.to_string()
}
}
/// Ensure git repository with output streaming
async fn ensure_git_repository_with_output(&self, command_id: &str, git_url: &str, git_branch: &str, working_dir: &str, api_key_file: Option<&str>) -> Result<()> {
// This is a simplified version - we can enhance this later with git output streaming
self.ensure_git_repository(git_url, git_branch, working_dir, api_key_file).await
}
/// Ensure git repository is cloned and up to date with force clone approach
async fn ensure_git_repository(&self, git_url: &str, git_branch: &str, working_dir: &str, api_key_file: Option<&str>) -> Result<()> {
use std::path::Path;
// Read API key if provided
let auth_url = if let Some(key_file) = api_key_file {
match tokio::fs::read_to_string(key_file).await {
Ok(api_key) => {
let api_key = api_key.trim();
if !api_key.is_empty() {
// Convert https://gitea.cmtec.se/cm/nixosbox.git to https://token@gitea.cmtec.se/cm/nixosbox.git
if git_url.starts_with("https://") {
let url_without_protocol = &git_url[8..]; // Remove "https://"
format!("https://{}@{}", api_key, url_without_protocol)
} else {
info!("API key provided but URL is not HTTPS, using original URL");
git_url.to_string()
}
} else {
info!("API key file is empty, using original URL");
git_url.to_string()
}
}
Err(e) => {
info!("Could not read API key file {}: {}, using original URL", key_file, e);
git_url.to_string()
}
}
} else {
git_url.to_string()
};
// Always remove existing directory and do fresh clone for consistent state
let working_path = Path::new(working_dir);
if working_path.exists() {
info!("Removing existing repository directory: {}", working_dir);
if let Err(e) = tokio::fs::remove_dir_all(working_path).await {
error!("Failed to remove existing directory: {}", e);
return Err(anyhow::anyhow!("Failed to remove existing directory: {}", e));
}
}
info!("Force cloning git repository from {} (branch: {})", git_url, git_branch);
// Force clone with depth 1 for efficiency (no history needed for deployment)
let output = tokio::process::Command::new("git")
.arg("clone")
.arg("--depth")
.arg("1")
.arg("--branch")
.arg(git_branch)
.arg(&auth_url)
.arg(working_dir)
.output()
.await?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
error!("Git clone failed: {}", stderr);
return Err(anyhow::anyhow!("Git clone failed: {}", stderr));
}
info!("Git repository cloned successfully with latest state");
Ok(())
}
}
}

View File

@@ -1,5 +1,5 @@
use anyhow::Result;
use cm_dashboard_shared::{CommandOutputMessage, MessageEnvelope, MetricMessage};
use cm_dashboard_shared::{MessageEnvelope, MetricMessage};
use tracing::{debug, info};
use zmq::{Context, Socket, SocketType};
@@ -65,23 +65,6 @@ impl ZmqHandler {
Ok(())
}
/// Publish command output message via ZMQ
pub async fn publish_command_output(&self, message: &CommandOutputMessage) -> Result<()> {
debug!(
"Publishing command output for host {} (command: {}): {}",
message.hostname,
message.command_type,
message.output_line
);
let envelope = MessageEnvelope::command_output(message.clone())?;
let serialized = serde_json::to_vec(&envelope)?;
self.publisher.send(&serialized, 0)?;
debug!("Command output published successfully");
Ok(())
}
/// Send heartbeat (placeholder for future use)
@@ -122,13 +105,6 @@ pub enum AgentCommand {
service_name: String,
action: ServiceAction,
},
/// Rebuild NixOS system
SystemRebuild {
git_url: String,
git_branch: String,
working_dir: String,
api_key_file: Option<String>,
},
}
/// Service control actions

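For reference, this removal leaves service control as the only agent command. A sketch of the remaining shape, with derives omitted and the ServiceAction variants assumed (its real definition follows the "/// Service control actions" comment above):

/// Service control actions (variants assumed; the real definition lives
/// in this same file, after the hunk shown above).
pub enum ServiceAction {
    Start,
    Stop,
    Restart,
}

/// Commands the dashboard can still send to the agent; the SystemRebuild
/// variant deleted above no longer exists.
pub enum AgentCommand {
    /// Control a service on the host
    ServiceControl {
        service_name: String,
        action: ServiceAction,
    },
}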
View File

@@ -141,6 +141,7 @@ pub struct NotificationConfig {
pub rate_limit_minutes: u64,
}
impl AgentConfig {
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
loader::load_config(path)