Implement real-time process monitoring and fix UI hardcoded data
This commit addresses several key issues identified during development.

Major Changes:
- Replace hardcoded top CPU/RAM process display with real system data
- Add intelligent process monitoring to CpuCollector using the ps command
- Fix disk metrics permission issues in the systemd collector
- Optimize service collection to focus on status, memory, and disk only
- Update dashboard widgets to display live process information

Process Monitoring Implementation:
- Added collect_top_cpu_process() and collect_top_ram_process() methods
- Implemented ps-based monitoring with accurate CPU percentages
- Added filtering to prevent self-monitoring artifacts (ps commands)
- Enhanced error handling and validation for process data
- Dashboard now shows realistic values like "claude (PID 2974) 11.0%"

Service Collection Optimization:
- Removed CPU monitoring from the systemd collector for efficiency
- Enhanced service directory permission error logging
- Simplified the services widget to show essential metrics only
- Fixed service-to-directory mapping accuracy

UI and Dashboard Improvements:
- Reorganized dashboard layout with a btop-inspired multi-panel design
- Updated the system panel to include the real top CPU/RAM process display
- Enhanced widget formatting and data presentation
- Removed placeholder/hardcoded data throughout the interface

Technical Details:
- Updated agent/src/collectors/cpu.rs with process monitoring
- Modified dashboard/src/ui/mod.rs for real-time process display
- Enhanced systemd collector error handling and disk metrics
- Updated CLAUDE.md documentation with implementation details
This commit is contained in:
171
shared/src/cache.rs
Normal file
171
shared/src/cache.rs
Normal file
@@ -0,0 +1,171 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Cache tier configuration
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct CacheTier {
|
||||
pub interval_seconds: u64,
|
||||
pub description: String,
|
||||
}
|
||||
|
||||
/// Cache configuration
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct CacheConfig {
|
||||
pub enabled: bool,
|
||||
pub default_ttl_seconds: u64,
|
||||
pub max_entries: usize,
|
||||
pub warming_timeout_seconds: u64,
|
||||
pub background_refresh_enabled: bool,
|
||||
pub cleanup_interval_seconds: u64,
|
||||
pub tiers: HashMap<String, CacheTier>,
|
||||
pub metric_assignments: HashMap<String, String>,
|
||||
}
|
||||
|
||||
impl Default for CacheConfig {
|
||||
fn default() -> Self {
|
||||
let mut tiers = HashMap::new();
|
||||
tiers.insert("realtime".to_string(), CacheTier {
|
||||
interval_seconds: 2,
|
||||
description: "Memory/CPU operations - no disk I/O (CPU, memory, service CPU/RAM)".to_string(),
|
||||
});
|
||||
tiers.insert("disk_light".to_string(), CacheTier {
|
||||
interval_seconds: 60,
|
||||
description: "Light disk operations - 1 minute (service status checks)".to_string(),
|
||||
});
|
||||
tiers.insert("disk_medium".to_string(), CacheTier {
|
||||
interval_seconds: 300,
|
||||
description: "Medium disk operations - 5 minutes (disk usage, service disk)".to_string(),
|
||||
});
|
||||
tiers.insert("disk_heavy".to_string(), CacheTier {
|
||||
interval_seconds: 900,
|
||||
description: "Heavy disk operations - 15 minutes (SMART data, backup status)".to_string(),
|
||||
});
|
||||
tiers.insert("static".to_string(), CacheTier {
|
||||
interval_seconds: 3600,
|
||||
description: "Hardware info that rarely changes - 1 hour".to_string(),
|
||||
});
|
||||
|
||||
let mut metric_assignments = HashMap::new();
|
||||
|
||||
// REALTIME (5s) - Memory/CPU operations, no disk I/O
|
||||
metric_assignments.insert("cpu_load_*".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("cpu_temperature_*".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("cpu_frequency_*".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("memory_*".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("service_*_cpu_percent".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("service_*_memory_mb".to_string(), "realtime".to_string());
|
||||
metric_assignments.insert("network_*".to_string(), "realtime".to_string());
|
||||
|
||||
// DISK_LIGHT (1min) - Light disk operations: service status checks
|
||||
metric_assignments.insert("service_*_status".to_string(), "disk_light".to_string());
|
||||
|
||||
// DISK_MEDIUM (5min) - Medium disk operations: du commands, disk usage
|
||||
metric_assignments.insert("service_*_disk_gb".to_string(), "disk_medium".to_string());
|
||||
metric_assignments.insert("disk_tmp_*".to_string(), "disk_medium".to_string());
|
||||
metric_assignments.insert("disk_*_usage_*".to_string(), "disk_medium".to_string());
|
||||
metric_assignments.insert("disk_*_size_*".to_string(), "disk_medium".to_string());
|
||||
|
||||
// DISK_HEAVY (15min) - Heavy disk operations: SMART data, backup status
|
||||
metric_assignments.insert("disk_*_temperature".to_string(), "disk_heavy".to_string());
|
||||
metric_assignments.insert("disk_*_wear_percent".to_string(), "disk_heavy".to_string());
|
||||
metric_assignments.insert("smart_*".to_string(), "disk_heavy".to_string());
|
||||
metric_assignments.insert("backup_*".to_string(), "disk_heavy".to_string());
|
||||
|
||||
Self {
|
||||
enabled: true,
|
||||
default_ttl_seconds: 30,
|
||||
max_entries: 10000,
|
||||
warming_timeout_seconds: 3,
|
||||
background_refresh_enabled: true,
|
||||
cleanup_interval_seconds: 1800,
|
||||
tiers,
|
||||
metric_assignments,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CacheConfig {
|
||||
/// Get the cache tier for a metric name
|
||||
pub fn get_tier_for_metric(&self, metric_name: &str) -> Option<&CacheTier> {
|
||||
// Find matching pattern
|
||||
for (pattern, tier_name) in &self.metric_assignments {
|
||||
if self.matches_pattern(metric_name, pattern) {
|
||||
return self.tiers.get(tier_name);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Check if metric name matches pattern (supports wildcards)
|
||||
fn matches_pattern(&self, metric_name: &str, pattern: &str) -> bool {
|
||||
if pattern.contains('*') {
|
||||
// Convert pattern to regex-like matching
|
||||
let pattern_parts: Vec<&str> = pattern.split('*').collect();
|
||||
|
||||
if pattern_parts.len() == 2 {
|
||||
let prefix = pattern_parts[0];
|
||||
let suffix = pattern_parts[1];
|
||||
|
||||
if suffix.is_empty() {
|
||||
// Pattern like "cpu_*" - just check prefix
|
||||
metric_name.starts_with(prefix)
|
||||
} else if prefix.is_empty() {
|
||||
// Pattern like "*_status" - just check suffix
|
||||
metric_name.ends_with(suffix)
|
||||
} else {
|
||||
// Pattern like "service_*_disk_gb" - check prefix and suffix
|
||||
metric_name.starts_with(prefix) && metric_name.ends_with(suffix)
|
||||
}
|
||||
} else {
|
||||
// More complex patterns - for now, just check if all parts are present
|
||||
pattern_parts.iter().all(|part| {
|
||||
part.is_empty() || metric_name.contains(part)
|
||||
})
|
||||
}
|
||||
} else {
|
||||
metric_name == pattern
|
||||
}
|
||||
}
|
||||
|
||||
/// Get cache interval for a metric
|
||||
pub fn get_cache_interval(&self, metric_name: &str) -> u64 {
|
||||
self.get_tier_for_metric(metric_name)
|
||||
.map(|tier| tier.interval_seconds)
|
||||
.unwrap_or(self.default_ttl_seconds)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `metric` resolves to the tier named `tier` and that the
    /// reported interval is that tier's interval.
    ///
    /// Comparing against the configured tier (instead of a literal number of
    /// seconds) keeps the test valid when tier intervals are re-tuned.
    fn assert_tier(config: &CacheConfig, metric: &str, tier: &str) {
        let expected = config.tiers.get(tier).expect("tier is defined");
        let actual = config
            .get_tier_for_metric(metric)
            .expect("metric has a tier assignment");
        assert!(
            std::ptr::eq(actual, expected),
            "{} should be assigned to tier {}",
            metric,
            tier
        );
        assert_eq!(config.get_cache_interval(metric), expected.interval_seconds);
    }

    #[test]
    fn test_pattern_matching() {
        let config = CacheConfig::default();

        assert!(config.matches_pattern("cpu_load_1min", "cpu_load_*"));
        assert!(config.matches_pattern("service_nginx_disk_gb", "service_*_disk_gb"));
        assert!(!config.matches_pattern("memory_usage_percent", "cpu_load_*"));
    }

    #[test]
    fn test_tier_assignment() {
        let config = CacheConfig::default();

        // Realtime - CPU/Memory operations
        assert_tier(&config, "cpu_load_1min", "realtime");
        assert_tier(&config, "memory_usage_percent", "realtime");
        assert_tier(&config, "service_nginx_cpu_percent", "realtime");

        // Disk light - Service status
        assert_tier(&config, "service_nginx_status", "disk_light");

        // Disk medium - Disk usage
        assert_tier(&config, "service_nginx_disk_gb", "disk_medium");
        assert_tier(&config, "disk_tmp_usage_percent", "disk_medium");

        // Disk heavy - SMART data
        assert_tier(&config, "disk_nvme0_temperature", "disk_heavy");
        assert_tier(&config, "smart_nvme0_wear_percent", "disk_heavy");

        // Unassigned metrics fall back to the default TTL
        assert_eq!(
            config.get_cache_interval("unassigned_metric"),
            config.default_ttl_seconds
        );
    }
}
|
||||
@@ -1,23 +0,0 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum AgentType {
|
||||
Smart,
|
||||
Service,
|
||||
System,
|
||||
Backup,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MetricsEnvelope {
|
||||
pub hostname: String,
|
||||
pub agent_type: AgentType,
|
||||
pub timestamp: u64,
|
||||
#[serde(default)]
|
||||
pub metrics: Value,
|
||||
}
|
||||
|
||||
// Alias for backward compatibility
|
||||
pub type MessageEnvelope = MetricsEnvelope;
|
||||
21
shared/src/error.rs
Normal file
21
shared/src/error.rs
Normal file
@@ -0,0 +1,21 @@
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum SharedError {
|
||||
#[error("Serialization error: {message}")]
|
||||
Serialization { message: String },
|
||||
|
||||
#[error("Invalid metric value: {message}")]
|
||||
InvalidMetric { message: String },
|
||||
|
||||
#[error("Protocol error: {message}")]
|
||||
Protocol { message: String },
|
||||
}
|
||||
|
||||
impl From<serde_json::Error> for SharedError {
|
||||
fn from(err: serde_json::Error) -> Self {
|
||||
SharedError::Serialization {
|
||||
message: err.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1 +1,9 @@
|
||||
pub mod envelope;
|
||||
pub mod cache;
|
||||
pub mod error;
|
||||
pub mod metrics;
|
||||
pub mod protocol;
|
||||
|
||||
pub use cache::*;
|
||||
pub use error::*;
|
||||
pub use metrics::*;
|
||||
pub use protocol::*;
|
||||
161
shared/src/metrics.rs
Normal file
161
shared/src/metrics.rs
Normal file
@@ -0,0 +1,161 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use chrono::{DateTime, Utc};
|
||||
|
||||
/// Individual metric with value, status, and metadata
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Metric {
|
||||
pub name: String,
|
||||
pub value: MetricValue,
|
||||
pub status: Status,
|
||||
pub timestamp: u64,
|
||||
pub description: Option<String>,
|
||||
pub unit: Option<String>,
|
||||
}
|
||||
|
||||
impl Metric {
|
||||
pub fn new(name: String, value: MetricValue, status: Status) -> Self {
|
||||
Self {
|
||||
name,
|
||||
value,
|
||||
status,
|
||||
timestamp: Utc::now().timestamp() as u64,
|
||||
description: None,
|
||||
unit: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_description(mut self, description: String) -> Self {
|
||||
self.description = Some(description);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn with_unit(mut self, unit: String) -> Self {
|
||||
self.unit = Some(unit);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// Typed metric values
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum MetricValue {
|
||||
Float(f32),
|
||||
Integer(i64),
|
||||
String(String),
|
||||
Boolean(bool),
|
||||
}
|
||||
|
||||
impl MetricValue {
|
||||
pub fn as_f32(&self) -> Option<f32> {
|
||||
match self {
|
||||
MetricValue::Float(f) => Some(*f),
|
||||
MetricValue::Integer(i) => Some(*i as f32),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_i64(&self) -> Option<i64> {
|
||||
match self {
|
||||
MetricValue::Integer(i) => Some(*i),
|
||||
MetricValue::Float(f) => Some(*f as i64),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_string(&self) -> String {
|
||||
match self {
|
||||
MetricValue::String(s) => s.clone(),
|
||||
MetricValue::Float(f) => f.to_string(),
|
||||
MetricValue::Integer(i) => i.to_string(),
|
||||
MetricValue::Boolean(b) => b.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_bool(&self) -> Option<bool> {
|
||||
match self {
|
||||
MetricValue::Boolean(b) => Some(*b),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Health status for metrics
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum Status {
|
||||
Ok,
|
||||
Warning,
|
||||
Critical,
|
||||
Unknown,
|
||||
}
|
||||
|
||||
impl Status {
|
||||
/// Aggregate multiple statuses - returns the worst status
|
||||
pub fn aggregate(statuses: &[Status]) -> Status {
|
||||
statuses.iter().max().copied().unwrap_or(Status::Unknown)
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Status {
|
||||
fn default() -> Self {
|
||||
Status::Unknown
|
||||
}
|
||||
}
|
||||
|
||||
/// Metric name registry - constants for all metric names.
///
/// Names ending in `_TEMPLATE` contain a `{...}` placeholder that must be
/// filled in with the matching helper (`disk_metric`, `service_metric`,
/// `network_metric`) before use.
pub mod registry {
    // CPU metrics
    pub const CPU_LOAD_1MIN: &str = "cpu_load_1min";
    pub const CPU_LOAD_5MIN: &str = "cpu_load_5min";
    pub const CPU_LOAD_15MIN: &str = "cpu_load_15min";
    pub const CPU_TEMPERATURE_CELSIUS: &str = "cpu_temperature_celsius";
    pub const CPU_FREQUENCY_MHZ: &str = "cpu_frequency_mhz";
    pub const CPU_USAGE_PERCENT: &str = "cpu_usage_percent";

    // Memory metrics
    pub const MEMORY_USAGE_PERCENT: &str = "memory_usage_percent";
    pub const MEMORY_TOTAL_GB: &str = "memory_total_gb";
    pub const MEMORY_USED_GB: &str = "memory_used_gb";
    pub const MEMORY_AVAILABLE_GB: &str = "memory_available_gb";
    pub const MEMORY_SWAP_TOTAL_GB: &str = "memory_swap_total_gb";
    pub const MEMORY_SWAP_USED_GB: &str = "memory_swap_used_gb";

    // Disk metrics (template - actual names include device)
    pub const DISK_USAGE_PERCENT_TEMPLATE: &str = "disk_{device}_usage_percent";
    pub const DISK_TEMPERATURE_CELSIUS_TEMPLATE: &str = "disk_{device}_temperature_celsius";
    pub const DISK_WEAR_PERCENT_TEMPLATE: &str = "disk_{device}_wear_percent";
    pub const DISK_SPARE_PERCENT_TEMPLATE: &str = "disk_{device}_spare_percent";
    pub const DISK_HOURS_TEMPLATE: &str = "disk_{device}_hours";
    pub const DISK_CAPACITY_GB_TEMPLATE: &str = "disk_{device}_capacity_gb";

    // Service metrics (template - actual names include service)
    pub const SERVICE_STATUS_TEMPLATE: &str = "service_{name}_status";
    pub const SERVICE_MEMORY_MB_TEMPLATE: &str = "service_{name}_memory_mb";
    pub const SERVICE_CPU_PERCENT_TEMPLATE: &str = "service_{name}_cpu_percent";

    // Backup metrics
    pub const BACKUP_STATUS: &str = "backup_status";
    pub const BACKUP_LAST_RUN_TIMESTAMP: &str = "backup_last_run_timestamp";
    pub const BACKUP_SIZE_GB: &str = "backup_size_gb";
    pub const BACKUP_DURATION_MINUTES: &str = "backup_duration_minutes";
    pub const BACKUP_NEXT_SCHEDULED_TIMESTAMP: &str = "backup_next_scheduled_timestamp";

    // Network metrics (template - actual names include interface)
    pub const NETWORK_RX_BYTES_TEMPLATE: &str = "network_{interface}_rx_bytes";
    pub const NETWORK_TX_BYTES_TEMPLATE: &str = "network_{interface}_tx_bytes";
    pub const NETWORK_RX_PACKETS_TEMPLATE: &str = "network_{interface}_rx_packets";
    pub const NETWORK_TX_PACKETS_TEMPLATE: &str = "network_{interface}_tx_packets";

    /// Generate disk metric name from template.
    ///
    /// Replaces every `{device}` in `template` with `device`; a template
    /// without the placeholder is returned unchanged.
    pub fn disk_metric(template: &str, device: &str) -> String {
        template.replace("{device}", device)
    }

    /// Generate service metric name from template.
    ///
    /// Replaces every `{name}` in `template` with `name`; a template without
    /// the placeholder is returned unchanged.
    pub fn service_metric(template: &str, name: &str) -> String {
        template.replace("{name}", name)
    }

    /// Generate network metric name from template.
    ///
    /// Replaces every `{interface}` in `template` with `interface`; a
    /// template without the placeholder is returned unchanged.
    pub fn network_metric(template: &str, interface: &str) -> String {
        template.replace("{interface}", interface)
    }
}
|
||||
116
shared/src/protocol.rs
Normal file
116
shared/src/protocol.rs
Normal file
@@ -0,0 +1,116 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use crate::metrics::Metric;
|
||||
|
||||
/// Message sent from agent to dashboard via ZMQ
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MetricMessage {
|
||||
pub hostname: String,
|
||||
pub timestamp: u64,
|
||||
pub metrics: Vec<Metric>,
|
||||
}
|
||||
|
||||
impl MetricMessage {
|
||||
pub fn new(hostname: String, metrics: Vec<Metric>) -> Self {
|
||||
Self {
|
||||
hostname,
|
||||
timestamp: chrono::Utc::now().timestamp() as u64,
|
||||
metrics,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Commands that can be sent from dashboard to agent
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub enum Command {
|
||||
/// Request immediate metric refresh
|
||||
RefreshMetrics,
|
||||
/// Request specific metrics by name
|
||||
RequestMetrics { metric_names: Vec<String> },
|
||||
/// Ping command for connection testing
|
||||
Ping,
|
||||
}
|
||||
|
||||
/// Response from agent to dashboard commands
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub enum CommandResponse {
|
||||
/// Acknowledgment of command
|
||||
Ack,
|
||||
/// Metrics response
|
||||
Metrics(Vec<Metric>),
|
||||
/// Pong response to ping
|
||||
Pong,
|
||||
/// Error response
|
||||
Error { message: String },
|
||||
}
|
||||
|
||||
/// ZMQ message envelope for routing
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct MessageEnvelope {
|
||||
pub message_type: MessageType,
|
||||
pub payload: Vec<u8>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub enum MessageType {
|
||||
Metrics,
|
||||
Command,
|
||||
CommandResponse,
|
||||
Heartbeat,
|
||||
}
|
||||
|
||||
impl MessageEnvelope {
|
||||
pub fn metrics(message: MetricMessage) -> Result<Self, crate::SharedError> {
|
||||
Ok(Self {
|
||||
message_type: MessageType::Metrics,
|
||||
payload: serde_json::to_vec(&message)?,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn command(command: Command) -> Result<Self, crate::SharedError> {
|
||||
Ok(Self {
|
||||
message_type: MessageType::Command,
|
||||
payload: serde_json::to_vec(&command)?,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn command_response(response: CommandResponse) -> Result<Self, crate::SharedError> {
|
||||
Ok(Self {
|
||||
message_type: MessageType::CommandResponse,
|
||||
payload: serde_json::to_vec(&response)?,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn heartbeat() -> Result<Self, crate::SharedError> {
|
||||
Ok(Self {
|
||||
message_type: MessageType::Heartbeat,
|
||||
payload: Vec::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn decode_metrics(&self) -> Result<MetricMessage, crate::SharedError> {
|
||||
match self.message_type {
|
||||
MessageType::Metrics => Ok(serde_json::from_slice(&self.payload)?),
|
||||
_ => Err(crate::SharedError::Protocol {
|
||||
message: "Expected metrics message".to_string(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_command(&self) -> Result<Command, crate::SharedError> {
|
||||
match self.message_type {
|
||||
MessageType::Command => Ok(serde_json::from_slice(&self.payload)?),
|
||||
_ => Err(crate::SharedError::Protocol {
|
||||
message: "Expected command message".to_string(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn decode_command_response(&self) -> Result<CommandResponse, crate::SharedError> {
|
||||
match self.message_type {
|
||||
MessageType::CommandResponse => Ok(serde_json::from_slice(&self.payload)?),
|
||||
_ => Err(crate::SharedError::Protocol {
|
||||
message: "Expected command response message".to_string(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user