Fixes random host disconnections caused by blocking operations preventing timely ZMQ packet transmission.

Changes:
- Add run_command_with_timeout() wrapper using tokio for async command execution
- Apply 10s timeout to smartctl (prevents 30+ second hangs on failing drives)
- Apply 5s timeout to du, lsblk, systemctl list commands
- Apply 3s timeout to systemctl show/is-active, df, ip commands
- Apply 2s timeout to hostname command
- Use system 'timeout' command for sync operations where async is not needed

Critical fixes:
- smartctl: Failing drives could block for 30+ seconds per drive
- du: Large directories (Docker, PostgreSQL) could block for 10-30+ seconds
- systemctl/docker: Commands could block indefinitely during system issues

With a 1-second collection interval and a 10-second heartbeat timeout, any blocking operation longer than 10s causes false "host offline" alerts. These timeouts ensure collection completes quickly even during system degradation.
111 lines
4.3 KiB
Rust
111 lines
4.3 KiB
Rust
use async_trait::async_trait;
|
|
use cm_dashboard_shared::AgentData;
|
|
use std::fs;
|
|
use std::process::Command;
|
|
use tracing::debug;
|
|
|
|
use super::{Collector, CollectorError};
|
|
|
|
/// NixOS system information collector with structured data output
///
/// This collector gathers NixOS-specific information like:
/// - System generation/build information
/// - Version information
/// - Agent version from Nix store path
///
/// The type is a stateless unit struct; everything it reports is read from
/// the running system at collection time.
#[derive(Debug, Clone)]
pub struct NixOSCollector;
|
|
|
|
impl NixOSCollector {
|
|
pub fn new(_config: crate::config::NixOSConfig) -> Self {
|
|
Self
|
|
}
|
|
|
|
/// Collect NixOS system information and populate AgentData
|
|
async fn collect_nixos_info(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
|
debug!("Collecting NixOS system information");
|
|
|
|
// Set hostname (this is universal, not NixOS-specific)
|
|
agent_data.hostname = self.get_hostname().await.unwrap_or_else(|| "unknown".to_string());
|
|
|
|
// Set agent version from environment or Nix store path
|
|
agent_data.agent_version = self.get_agent_version().await;
|
|
|
|
// Set NixOS build/generation information
|
|
agent_data.build_version = self.get_nixos_generation().await;
|
|
|
|
// Set current timestamp
|
|
agent_data.timestamp = chrono::Utc::now().timestamp() as u64;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Get system hostname
|
|
async fn get_hostname(&self) -> Option<String> {
|
|
match fs::read_to_string("/etc/hostname") {
|
|
Ok(hostname) => Some(hostname.trim().to_string()),
|
|
Err(_) => {
|
|
// Fallback to hostname command (with 2 second timeout)
|
|
match Command::new("timeout").args(["2", "hostname"]).output() {
|
|
Ok(output) => Some(String::from_utf8_lossy(&output.stdout).trim().to_string()),
|
|
Err(_) => None,
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Get agent version from Nix store path or environment
|
|
async fn get_agent_version(&self) -> String {
|
|
// Try to extract version from the current executable path (Nix store)
|
|
if let Ok(current_exe) = std::env::current_exe() {
|
|
if let Some(exe_path) = current_exe.to_str() {
|
|
if exe_path.starts_with("/nix/store/") {
|
|
// Extract version from Nix store path
|
|
// Path format: /nix/store/hash-cm-dashboard-agent-v0.1.138/bin/cm-dashboard-agent
|
|
if let Some(store_part) = exe_path.strip_prefix("/nix/store/") {
|
|
if let Some(dash_pos) = store_part.find('-') {
|
|
let package_part = &store_part[dash_pos + 1..];
|
|
if let Some(bin_pos) = package_part.find("/bin/") {
|
|
let package_name = &package_part[..bin_pos];
|
|
// Extract version from package name
|
|
if let Some(version_start) = package_name.rfind("-v") {
|
|
return package_name[version_start + 1..].to_string();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fallback to environment variable or default
|
|
std::env::var("CM_DASHBOARD_VERSION").unwrap_or_else(|_| "unknown".to_string())
|
|
}
|
|
|
|
/// Get NixOS system generation (build) information from git commit
|
|
async fn get_nixos_generation(&self) -> Option<String> {
|
|
// Try to read git commit hash from file written during rebuild
|
|
let commit_file = "/var/lib/cm-dashboard/git-commit";
|
|
match fs::read_to_string(commit_file) {
|
|
Ok(content) => {
|
|
let commit_hash = content.trim();
|
|
if commit_hash.len() >= 7 {
|
|
debug!("Found git commit hash: {}", commit_hash);
|
|
Some(commit_hash.to_string())
|
|
} else {
|
|
debug!("Git commit hash too short: {}", commit_hash);
|
|
None
|
|
}
|
|
}
|
|
Err(e) => {
|
|
debug!("Failed to read git commit file {}: {}", commit_file, e);
|
|
None
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#[async_trait]
|
|
impl Collector for NixOSCollector {
|
|
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
|
self.collect_nixos_info(agent_data).await
|
|
}
|
|
} |