Fixes random host disconnections caused by blocking operations preventing timely ZMQ packet transmission. Changes: - Add run_command_with_timeout() wrapper using tokio for async command execution - Apply 10s timeout to smartctl (prevents 30+ second hangs on failing drives) - Apply 5s timeout to du, lsblk, systemctl list commands - Apply 3s timeout to systemctl show/is-active, df, ip commands - Apply 2s timeout to hostname command - Use system 'timeout' command for sync operations where async not needed Critical fixes: - smartctl: Failing drives could block for 30+ seconds per drive - du: Large directories (Docker, PostgreSQL) could block 10-30+ seconds - systemctl/docker: Commands could block indefinitely during system issues With 1-second collection interval and 10-second heartbeat timeout, any blocking operation >10s causes false "host offline" alerts. These timeouts ensure collection completes quickly even during system degradation.
115 lines
3.9 KiB
Rust
115 lines
3.9 KiB
Rust
use async_trait::async_trait;
|
|
use cm_dashboard_shared::{AgentData};
|
|
use std::process::{Command, Output};
|
|
use std::time::Duration;
|
|
use tokio::time::timeout;
|
|
|
|
pub mod backup;
|
|
pub mod cpu;
|
|
pub mod disk;
|
|
pub mod error;
|
|
pub mod memory;
|
|
pub mod network;
|
|
pub mod nixos;
|
|
pub mod systemd;
|
|
|
|
pub use error::CollectorError;
|
|
|
|
/// Run a command with a timeout to prevent blocking
|
|
pub async fn run_command_with_timeout(mut cmd: Command, timeout_secs: u64) -> std::io::Result<Output> {
|
|
let timeout_duration = Duration::from_secs(timeout_secs);
|
|
|
|
match timeout(timeout_duration, tokio::task::spawn_blocking(move || cmd.output())).await {
|
|
Ok(Ok(result)) => result,
|
|
Ok(Err(e)) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
|
|
Err(_) => Err(std::io::Error::new(
|
|
std::io::ErrorKind::TimedOut,
|
|
format!("Command timed out after {} seconds", timeout_secs)
|
|
)),
|
|
}
|
|
}
|
|
|
|
|
|
/// Base trait for all collectors with direct structured data output
|
|
#[async_trait]
|
|
pub trait Collector: Send + Sync {
|
|
/// Collect data and populate AgentData directly with status evaluation
|
|
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError>;
|
|
}
|
|
|
|
/// CPU efficiency rules for all collectors
pub mod efficiency {
    //! CRITICAL: every collector must follow these efficiency rules so that
    //! monitoring has minimal impact on the monitored system.
    //!
    //! # FILE READING RULES
    //! - Read entire files in a single syscall when possible
    //! - Use `BufReader` only for very large files (>4KB)
    //! - Never read files character by character
    //! - Cache file descriptors when safe (immutable paths)
    //!
    //! # PARSING RULES
    //! - Use `split()` instead of regex for simple patterns
    //! - Parse numbers with `from_str()`, not complex parsing
    //! - Avoid string allocations in hot paths
    //! - Use `str::trim()` before parsing numbers
    //!
    //! # MEMORY ALLOCATION RULES
    //! - Reuse `Vec` buffers when possible
    //! - Pre-allocate collections with known sizes
    //! - Use `str` slices instead of `String` when possible
    //! - Avoid `clone()` in hot paths
    //!
    //! # SYSTEM CALL RULES
    //! - Minimize syscalls: prefer a single read over multiple reads
    //! - Use the /proc filesystem efficiently
    //! - Avoid spawning processes when /proc data is available
    //! - Cache static data (like CPU count)
    //!
    //! # ERROR HANDLING RULES
    //! - Use `Result<>` but minimize allocation in error paths
    //! - Log errors at debug level only, to avoid I/O overhead
    //! - Degrade gracefully: missing metrics are better than failing
    //! - Never panic in collectors
    //!
    //! # CONCURRENCY RULES
    //! - Collectors must be thread-safe but avoid locks
    //! - Use atomic operations for simple counters
    //! - Avoid shared mutable state between collections
    //! - Each collection should be independent
}
|
|
|
|
/// Utility functions for efficient system data collection
|
|
pub mod utils {
|
|
use super::CollectorError;
|
|
use std::fs;
|
|
|
|
/// Read entire file content efficiently
|
|
pub fn read_proc_file(path: &str) -> Result<String, CollectorError> {
|
|
fs::read_to_string(path).map_err(|e| CollectorError::SystemRead {
|
|
path: path.to_string(),
|
|
error: e.to_string(),
|
|
})
|
|
}
|
|
|
|
/// Parse float from string slice efficiently
|
|
pub fn parse_f32(s: &str) -> Result<f32, CollectorError> {
|
|
s.trim()
|
|
.parse()
|
|
.map_err(|e: std::num::ParseFloatError| CollectorError::Parse {
|
|
value: s.to_string(),
|
|
error: e.to_string(),
|
|
})
|
|
}
|
|
|
|
/// Parse integer from string slice efficiently
|
|
pub fn parse_u64(s: &str) -> Result<u64, CollectorError> {
|
|
s.trim()
|
|
.parse()
|
|
.map_err(|e: std::num::ParseIntError| CollectorError::Parse {
|
|
value: s.to_string(),
|
|
error: e.to_string(),
|
|
})
|
|
}
|
|
|
|
}
|