Implement real-time process monitoring and fix UI hardcoded data

This commit addresses several key issues identified during development:

Major Changes:
- Replace hardcoded top CPU/RAM process display with real system data
- Add intelligent process monitoring to CpuCollector using ps command
- Fix disk metrics permission issues in systemd collector
- Optimize service collection to focus on status, memory, and disk only
- Update dashboard widgets to display live process information

Process Monitoring Implementation:
- Added collect_top_cpu_process() and collect_top_ram_process() methods
- Implemented ps-based monitoring with accurate CPU percentages
- Added filtering so the monitoring `ps` commands themselves are excluded from the results
- Enhanced error handling and validation for process data
- Dashboard now shows realistic values like "claude (PID 2974) 11.0%"

Service Collection Optimization:
- Removed CPU monitoring from systemd collector for efficiency
- Enhanced service directory permission error logging
- Simplified services widget to show essential metrics only
- Fixed service-to-directory mapping accuracy

UI and Dashboard Improvements:
- Reorganized dashboard layout with btop-inspired multi-panel design
- Updated system panel to include real top CPU/RAM process display
- Enhanced widget formatting and data presentation
- Removed placeholder/hardcoded data throughout the interface

Technical Details:
- Updated agent/src/collectors/cpu.rs with process monitoring
- Modified dashboard/src/ui/mod.rs for real-time process display
- Enhanced systemd collector error handling and disk metrics
- Updated CLAUDE.md documentation with implementation details
This commit is contained in:
2025-10-16 23:55:05 +02:00
parent 7a664ef0fb
commit 8a36472a3d
81 changed files with 7702 additions and 9608 deletions

View File

@@ -0,0 +1,204 @@
use anyhow::Result;
use cm_dashboard_shared::{MetricMessage, MessageEnvelope, MessageType};
use tracing::{info, error, debug, warn};
use zmq::{Context, Socket, SocketType};
use std::time::Duration;
use crate::config::ZmqConfig;
/// Control messages the dashboard can issue to a remote agent.
///
/// The type uses serde's derived representation; the only encoding visible in
/// this file is JSON, produced by `ZmqCommandSender::send_command`.
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
pub enum AgentCommand {
    /// Ask the agent to collect metrics right away.
    CollectNow,
    /// Change the agent's collection interval.
    SetInterval {
        /// New interval length, in seconds.
        seconds: u64,
    },
    /// Switch a single collector on or off.
    ToggleCollector {
        /// Name identifying the collector.
        name: String,
        /// Whether the collector should be enabled.
        enabled: bool,
    },
    /// Status / health check request.
    Ping,
}
/// Subscriber-side endpoint that receives agent messages over ZeroMQ.
///
/// One `SUB` socket is shared by every agent endpoint attached through
/// `connect_to_host`.
pub struct ZmqConsumer {
    /// The `SUB` socket all agent endpoints are attached to.
    subscriber: Socket,
    /// Private copy of the ZMQ settings, cloned at construction time.
    config: ZmqConfig,
    /// Hostnames for which `connect` on the socket has succeeded.
    connected_hosts: std::collections::HashSet<String>,
}
impl ZmqConsumer {
pub async fn new(config: &ZmqConfig) -> Result<Self> {
let context = Context::new();
// Create subscriber socket
let subscriber = context.socket(SocketType::SUB)?;
// Set socket options
subscriber.set_rcvtimeo(1000)?; // 1 second timeout for non-blocking receives
subscriber.set_subscribe(b"")?; // Subscribe to all messages
info!("ZMQ consumer initialized");
Ok(Self {
subscriber,
config: config.clone(),
connected_hosts: std::collections::HashSet::new(),
})
}
/// Connect to a specific host's agent
pub async fn connect_to_host(&mut self, hostname: &str, port: u16) -> Result<()> {
let address = format!("tcp://{}:{}", hostname, port);
match self.subscriber.connect(&address) {
Ok(()) => {
info!("Connected to agent at {}", address);
self.connected_hosts.insert(hostname.to_string());
Ok(())
}
Err(e) => {
error!("Failed to connect to agent at {}: {}", address, e);
Err(anyhow::anyhow!("Failed to connect to {}: {}", address, e))
}
}
}
/// Connect to predefined hosts
pub async fn connect_to_predefined_hosts(&mut self, hosts: &[String]) -> Result<()> {
let default_port = self.config.subscriber_ports[0];
for hostname in hosts {
// Try to connect, but don't fail if some hosts are unreachable
if let Err(e) = self.connect_to_host(hostname, default_port).await {
warn!("Could not connect to {}: {}", hostname, e);
}
}
info!("Connected to {} out of {} configured hosts",
self.connected_hosts.len(), hosts.len());
Ok(())
}
/// Get list of newly connected hosts since last check
pub fn get_newly_connected_hosts(&self) -> Vec<String> {
// For now, return all connected hosts (could be enhanced with state tracking)
self.connected_hosts.iter().cloned().collect()
}
/// Receive metrics from any connected agent (non-blocking)
pub async fn receive_metrics(&mut self) -> Result<Option<MetricMessage>> {
match self.subscriber.recv_bytes(zmq::DONTWAIT) {
Ok(data) => {
debug!("Received {} bytes from ZMQ", data.len());
// Deserialize envelope
let envelope: MessageEnvelope = serde_json::from_slice(&data)
.map_err(|e| anyhow::anyhow!("Failed to deserialize envelope: {}", e))?;
// Check message type
match envelope.message_type {
MessageType::Metrics => {
let metrics = envelope.decode_metrics()
.map_err(|e| anyhow::anyhow!("Failed to decode metrics: {}", e))?;
debug!("Received {} metrics from {}",
metrics.metrics.len(), metrics.hostname);
Ok(Some(metrics))
}
MessageType::Heartbeat => {
debug!("Received heartbeat");
Ok(None) // Don't return heartbeats as metrics
}
_ => {
debug!("Received non-metrics message: {:?}", envelope.message_type);
Ok(None)
}
}
}
Err(zmq::Error::EAGAIN) => {
// No message available (non-blocking mode)
Ok(None)
}
Err(e) => {
error!("ZMQ receive error: {}", e);
Err(anyhow::anyhow!("ZMQ receive error: {}", e))
}
}
}
/// Get list of connected hosts
pub fn get_connected_hosts(&self) -> Vec<String> {
self.connected_hosts.iter().cloned().collect()
}
/// Check if connected to any hosts
pub fn has_connections(&self) -> bool {
!self.connected_hosts.is_empty()
}
}
/// Sends [`AgentCommand`]s from the dashboard to agents over ZeroMQ.
pub struct ZmqCommandSender {
    /// ZeroMQ context from which a socket is created for each command.
    context: Context,
    /// Copy of the ZMQ settings cloned at construction time; not read by
    /// `new`, `send_command` or `broadcast_command`.
    config: ZmqConfig,
}
impl ZmqCommandSender {
pub fn new(config: &ZmqConfig) -> Result<Self> {
let context = Context::new();
info!("ZMQ command sender initialized");
Ok(Self {
context,
config: config.clone(),
})
}
/// Send a command to a specific agent
pub async fn send_command(&self, hostname: &str, command: AgentCommand) -> Result<()> {
// Create a new PUSH socket for this command (ZMQ best practice)
let socket = self.context.socket(SocketType::PUSH)?;
// Set socket options
socket.set_linger(1000)?; // Wait up to 1 second on close
socket.set_sndtimeo(5000)?; // 5 second send timeout
// Connect to agent's command port (6131)
let address = format!("tcp://{}:6131", hostname);
socket.connect(&address)?;
// Serialize command
let serialized = serde_json::to_vec(&command)?;
// Send command
socket.send(&serialized, 0)?;
info!("Sent command {:?} to agent at {}", command, hostname);
// Socket will be automatically closed when dropped
Ok(())
}
/// Send a command to all connected hosts
pub async fn broadcast_command(&self, hosts: &[String], command: AgentCommand) -> Result<Vec<String>> {
let mut failed_hosts = Vec::new();
for hostname in hosts {
if let Err(e) = self.send_command(hostname, command.clone()).await {
error!("Failed to send command to {}: {}", hostname, e);
failed_hosts.push(hostname.clone());
}
}
if failed_hosts.is_empty() {
info!("Successfully broadcast command {:?} to {} hosts", command, hosts.len());
} else {
warn!("Failed to send command to {} hosts: {:?}", failed_hosts.len(), failed_hosts);
}
Ok(failed_hosts)
}
}