Replace complex SystemRebuild with simple SSH + tmux popup approach
All checks were successful
Build and Release / build-and-release (push) Successful in 2m6s
All checks were successful
Build and Release / build-and-release (push) Successful in 2m6s
- Remove all SystemRebuild command infrastructure from agent and dashboard
- Replace with direct tmux popup execution: ssh {user}@{host} {alias}
- Add configurable SSH user and rebuild alias in dashboard config
- Eliminate agent process crashes during rebuilds
- Simplify architecture by removing ZMQ command streaming complexity
- Clean up all related dead code and fix compilation warnings
Benefits:
- Process isolation: rebuild runs independently via SSH
- Crash resilience: agent/dashboard can restart without affecting rebuilds
- Configuration flexibility: SSH user and alias configurable per deployment
- Operational simplicity: standard tmux popup interface
This commit is contained in:
@@ -9,7 +9,7 @@ use crate::config::AgentConfig;
|
||||
use crate::metrics::MetricCollectionManager;
|
||||
use crate::notifications::NotificationManager;
|
||||
use crate::status::HostStatusManager;
|
||||
use cm_dashboard_shared::{CommandOutputMessage, Metric, MetricMessage, MetricValue, Status};
|
||||
use cm_dashboard_shared::{Metric, MetricMessage, MetricValue, Status};
|
||||
|
||||
pub struct Agent {
|
||||
hostname: String,
|
||||
@@ -254,12 +254,6 @@ impl Agent {
|
||||
error!("Failed to execute service control: {}", e);
|
||||
}
|
||||
}
|
||||
AgentCommand::SystemRebuild { git_url, git_branch, working_dir, api_key_file } => {
|
||||
info!("Processing SystemRebuild command: {} @ {} -> {}", git_url, git_branch, working_dir);
|
||||
if let Err(e) = self.handle_system_rebuild(&git_url, &git_branch, &working_dir, api_key_file.as_deref()).await {
|
||||
error!("Failed to execute system rebuild: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -303,272 +297,4 @@ impl Agent {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handle NixOS system rebuild commands with real-time output streaming
|
||||
async fn handle_system_rebuild(&self, git_url: &str, git_branch: &str, working_dir: &str, api_key_file: Option<&str>) -> Result<()> {
|
||||
info!("Starting NixOS system rebuild: {} @ {} -> {}", git_url, git_branch, working_dir);
|
||||
|
||||
let command_id = format!("rebuild_{}", chrono::Utc::now().timestamp());
|
||||
|
||||
// Send initial status
|
||||
self.send_command_output(&command_id, "SystemRebuild", "Starting NixOS system rebuild...").await?;
|
||||
|
||||
// Enable maintenance mode before rebuild
|
||||
let maintenance_file = "/tmp/cm-maintenance";
|
||||
if let Err(e) = tokio::fs::File::create(maintenance_file).await {
|
||||
self.send_command_output(&command_id, "SystemRebuild", &format!("Warning: Failed to create maintenance mode file: {}", e)).await?;
|
||||
} else {
|
||||
self.send_command_output(&command_id, "SystemRebuild", "Maintenance mode enabled").await?;
|
||||
}
|
||||
|
||||
// Clone or update repository
|
||||
self.send_command_output(&command_id, "SystemRebuild", "Cloning/updating git repository...").await?;
|
||||
let git_result = self.ensure_git_repository_with_output(&command_id, git_url, git_branch, working_dir, api_key_file).await;
|
||||
|
||||
if git_result.is_err() {
|
||||
self.send_command_output(&command_id, "SystemRebuild", &format!("Git operation failed: {:?}", git_result)).await?;
|
||||
self.send_command_output_complete(&command_id, "SystemRebuild").await?;
|
||||
return git_result;
|
||||
}
|
||||
|
||||
self.send_command_output(&command_id, "SystemRebuild", "Git repository ready, starting nixos-rebuild...").await?;
|
||||
|
||||
// Execute nixos-rebuild with real-time output streaming
|
||||
let rebuild_result = self.execute_nixos_rebuild_with_streaming(&command_id, working_dir).await;
|
||||
|
||||
// Always try to remove maintenance mode file
|
||||
if let Err(e) = tokio::fs::remove_file(maintenance_file).await {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
self.send_command_output(&command_id, "SystemRebuild", &format!("Warning: Failed to remove maintenance mode file: {}", e)).await?;
|
||||
}
|
||||
} else {
|
||||
self.send_command_output(&command_id, "SystemRebuild", "Maintenance mode disabled").await?;
|
||||
}
|
||||
|
||||
// Handle rebuild result
|
||||
match rebuild_result {
|
||||
Ok(()) => {
|
||||
self.send_command_output(&command_id, "SystemRebuild", "✓ NixOS rebuild completed successfully!").await?;
|
||||
}
|
||||
Err(e) => {
|
||||
self.send_command_output(&command_id, "SystemRebuild", &format!("✗ NixOS rebuild failed: {}", e)).await?;
|
||||
}
|
||||
}
|
||||
|
||||
// Signal completion
|
||||
self.send_command_output_complete(&command_id, "SystemRebuild").await?;
|
||||
|
||||
info!("System rebuild streaming completed");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Send command output line to dashboard
|
||||
async fn send_command_output(&self, command_id: &str, command_type: &str, output_line: &str) -> Result<()> {
|
||||
let message = CommandOutputMessage::new(
|
||||
self.hostname.clone(),
|
||||
command_id.to_string(),
|
||||
command_type.to_string(),
|
||||
output_line.to_string(),
|
||||
false,
|
||||
);
|
||||
self.zmq_handler.publish_command_output(&message).await
|
||||
}
|
||||
|
||||
/// Send command completion signal to dashboard
|
||||
async fn send_command_output_complete(&self, command_id: &str, command_type: &str) -> Result<()> {
|
||||
let message = CommandOutputMessage::new(
|
||||
self.hostname.clone(),
|
||||
command_id.to_string(),
|
||||
command_type.to_string(),
|
||||
"Command completed".to_string(),
|
||||
true,
|
||||
);
|
||||
self.zmq_handler.publish_command_output(&message).await
|
||||
}
|
||||
|
||||
/// Execute nixos-rebuild via systemd service with journal streaming
|
||||
async fn execute_nixos_rebuild_with_streaming(&self, command_id: &str, _working_dir: &str) -> Result<()> {
|
||||
use tokio::io::{AsyncBufReadExt, BufReader};
|
||||
use tokio::process::Command;
|
||||
|
||||
self.send_command_output(command_id, "SystemRebuild", "Starting nixos-rebuild via systemd service...").await?;
|
||||
|
||||
// Start the cm-rebuild systemd service
|
||||
let start_result = Command::new("sudo")
|
||||
.arg("systemctl")
|
||||
.arg("start")
|
||||
.arg("cm-rebuild")
|
||||
.output()
|
||||
.await?;
|
||||
|
||||
if !start_result.status.success() {
|
||||
let error = String::from_utf8_lossy(&start_result.stderr);
|
||||
return Err(anyhow::anyhow!("Failed to start cm-rebuild service: {}", error));
|
||||
}
|
||||
|
||||
self.send_command_output(command_id, "SystemRebuild", "✓ Service started, streaming output...").await?;
|
||||
|
||||
// Stream journal output in real-time
|
||||
let mut journal_child = Command::new("sudo")
|
||||
.arg("journalctl")
|
||||
.arg("-u")
|
||||
.arg("cm-rebuild")
|
||||
.arg("-f")
|
||||
.arg("--no-pager")
|
||||
.arg("--since")
|
||||
.arg("now")
|
||||
.stdout(std::process::Stdio::piped())
|
||||
.stderr(std::process::Stdio::piped())
|
||||
.spawn()?;
|
||||
|
||||
let stdout = journal_child.stdout.take().expect("Failed to get journalctl stdout");
|
||||
let mut reader = BufReader::new(stdout);
|
||||
let mut lines = reader.lines();
|
||||
|
||||
// Stream journal output and monitor service status
|
||||
let mut service_completed = false;
|
||||
let mut status_check_interval = tokio::time::interval(tokio::time::Duration::from_secs(2));
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
// Read journal output
|
||||
line = lines.next_line() => {
|
||||
match line {
|
||||
Ok(Some(line)) => {
|
||||
// Clean up journal format (remove timestamp/service prefix if needed)
|
||||
let clean_line = self.clean_journal_line(&line);
|
||||
self.send_command_output(command_id, "SystemRebuild", &clean_line).await?;
|
||||
}
|
||||
Ok(None) => {
|
||||
// journalctl stream ended
|
||||
break;
|
||||
}
|
||||
Err(_) => {
|
||||
// Error reading journal
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Periodically check service status
|
||||
_ = status_check_interval.tick() => {
|
||||
if let Ok(status_result) = Command::new("sudo")
|
||||
.arg("systemctl")
|
||||
.arg("is-active")
|
||||
.arg("cm-rebuild")
|
||||
.output()
|
||||
.await
|
||||
{
|
||||
let status = String::from_utf8_lossy(&status_result.stdout).trim().to_string();
|
||||
if status == "inactive" {
|
||||
service_completed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Kill journalctl process
|
||||
let _ = journal_child.kill().await;
|
||||
|
||||
// Check final service result
|
||||
let result = Command::new("sudo")
|
||||
.arg("systemctl")
|
||||
.arg("is-failed")
|
||||
.arg("cm-rebuild")
|
||||
.output()
|
||||
.await?;
|
||||
|
||||
let output_string = String::from_utf8_lossy(&result.stdout);
|
||||
let is_failed = output_string.trim();
|
||||
if is_failed == "failed" {
|
||||
return Err(anyhow::anyhow!("cm-rebuild service failed"));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Clean journal line to remove systemd metadata
|
||||
fn clean_journal_line(&self, line: &str) -> String {
|
||||
// Remove timestamp and service name prefix from journal entries
|
||||
// Example: "Oct 26 10:30:15 cmbox cm-rebuild[1234]: actual output"
|
||||
// Becomes: "actual output"
|
||||
|
||||
if let Some(colon_pos) = line.rfind(": ") {
|
||||
line[colon_pos + 2..].to_string()
|
||||
} else {
|
||||
line.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// Ensure git repository with output streaming
|
||||
async fn ensure_git_repository_with_output(&self, command_id: &str, git_url: &str, git_branch: &str, working_dir: &str, api_key_file: Option<&str>) -> Result<()> {
|
||||
// This is a simplified version - we can enhance this later with git output streaming
|
||||
self.ensure_git_repository(git_url, git_branch, working_dir, api_key_file).await
|
||||
}
|
||||
|
||||
/// Ensure git repository is cloned and up to date with force clone approach
|
||||
async fn ensure_git_repository(&self, git_url: &str, git_branch: &str, working_dir: &str, api_key_file: Option<&str>) -> Result<()> {
|
||||
use std::path::Path;
|
||||
|
||||
// Read API key if provided
|
||||
let auth_url = if let Some(key_file) = api_key_file {
|
||||
match tokio::fs::read_to_string(key_file).await {
|
||||
Ok(api_key) => {
|
||||
let api_key = api_key.trim();
|
||||
if !api_key.is_empty() {
|
||||
// Convert https://gitea.cmtec.se/cm/nixosbox.git to https://token@gitea.cmtec.se/cm/nixosbox.git
|
||||
if git_url.starts_with("https://") {
|
||||
let url_without_protocol = &git_url[8..]; // Remove "https://"
|
||||
format!("https://{}@{}", api_key, url_without_protocol)
|
||||
} else {
|
||||
info!("API key provided but URL is not HTTPS, using original URL");
|
||||
git_url.to_string()
|
||||
}
|
||||
} else {
|
||||
info!("API key file is empty, using original URL");
|
||||
git_url.to_string()
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
info!("Could not read API key file {}: {}, using original URL", key_file, e);
|
||||
git_url.to_string()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
git_url.to_string()
|
||||
};
|
||||
|
||||
// Always remove existing directory and do fresh clone for consistent state
|
||||
let working_path = Path::new(working_dir);
|
||||
if working_path.exists() {
|
||||
info!("Removing existing repository directory: {}", working_dir);
|
||||
if let Err(e) = tokio::fs::remove_dir_all(working_path).await {
|
||||
error!("Failed to remove existing directory: {}", e);
|
||||
return Err(anyhow::anyhow!("Failed to remove existing directory: {}", e));
|
||||
}
|
||||
}
|
||||
|
||||
info!("Force cloning git repository from {} (branch: {})", git_url, git_branch);
|
||||
|
||||
// Force clone with depth 1 for efficiency (no history needed for deployment)
|
||||
let output = tokio::process::Command::new("git")
|
||||
.arg("clone")
|
||||
.arg("--depth")
|
||||
.arg("1")
|
||||
.arg("--branch")
|
||||
.arg(git_branch)
|
||||
.arg(&auth_url)
|
||||
.arg(working_dir)
|
||||
.output()
|
||||
.await?;
|
||||
|
||||
if !output.status.success() {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
error!("Git clone failed: {}", stderr);
|
||||
return Err(anyhow::anyhow!("Git clone failed: {}", stderr));
|
||||
}
|
||||
|
||||
info!("Git repository cloned successfully with latest state");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
use anyhow::Result;
|
||||
use cm_dashboard_shared::{CommandOutputMessage, MessageEnvelope, MetricMessage};
|
||||
use cm_dashboard_shared::{MessageEnvelope, MetricMessage};
|
||||
use tracing::{debug, info};
|
||||
use zmq::{Context, Socket, SocketType};
|
||||
|
||||
@@ -65,23 +65,6 @@ impl ZmqHandler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Publish command output message via ZMQ
|
||||
pub async fn publish_command_output(&self, message: &CommandOutputMessage) -> Result<()> {
|
||||
debug!(
|
||||
"Publishing command output for host {} (command: {}): {}",
|
||||
message.hostname,
|
||||
message.command_type,
|
||||
message.output_line
|
||||
);
|
||||
|
||||
let envelope = MessageEnvelope::command_output(message.clone())?;
|
||||
let serialized = serde_json::to_vec(&envelope)?;
|
||||
|
||||
self.publisher.send(&serialized, 0)?;
|
||||
|
||||
debug!("Command output published successfully");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Send heartbeat (placeholder for future use)
|
||||
|
||||
@@ -122,13 +105,6 @@ pub enum AgentCommand {
|
||||
service_name: String,
|
||||
action: ServiceAction,
|
||||
},
|
||||
/// Rebuild NixOS system
|
||||
SystemRebuild {
|
||||
git_url: String,
|
||||
git_branch: String,
|
||||
working_dir: String,
|
||||
api_key_file: Option<String>,
|
||||
},
|
||||
}
|
||||
|
||||
/// Service control actions
|
||||
|
||||
@@ -141,6 +141,7 @@ pub struct NotificationConfig {
|
||||
pub rate_limit_minutes: u64,
|
||||
}
|
||||
|
||||
|
||||
impl AgentConfig {
|
||||
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
|
||||
loader::load_config(path)
|
||||
|
||||
Reference in New Issue
Block a user