Add 'r' refresh shortcut for on-demand metrics refresh
Implements ZMQ command protocol for dashboard-to-agent communication:

- Agents listen on port 6131 for REQ/REP commands
- Dashboard sends "refresh" command when 'r' key is pressed
- Agents force immediate collection of all metrics via force_refresh_all()
- Fresh data is broadcast immediately to dashboard
- Updated help text to show "r: Refresh all metrics"

Also includes metric-level caching architecture foundation for future granular control over individual metric update frequencies.
parent 244cade7d8
commit 6bc7f97375
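As a quick orientation before the diff, here is a minimal sketch of the refresh round-trip described above, assuming the `zmq` crate used by this project. The commit's actual implementation is `send_refresh_commands()` on the dashboard side and `handle_commands()` / `force_refresh_all()` on the agent side, shown in the hunks below; the standalone `send_refresh` function and its `agent_host` parameter are illustrative only.

// Sketch only: dashboard-side "refresh" round-trip over the command channel.
// `agent_host` is a hypothetical parameter; not part of this commit.
fn send_refresh(agent_host: &str) -> Result<String, zmq::Error> {
    let ctx = zmq::Context::new();
    let socket = ctx.socket(zmq::REQ)?;
    socket.set_linger(0)?;
    socket.set_sndtimeo(5000)?; // don't block forever if the agent is down
    socket.set_rcvtimeo(5000)?;

    // Metrics are published on 6130; commands go to the REP socket on 6131.
    socket.connect(&format!("tcp://{}:6131", agent_host))?;
    socket.send("refresh", 0)?;

    // The agent replies "refresh_started" after forcing immediate collection.
    match socket.recv_string(0)? {
        Ok(reply) => Ok(reply),
        Err(_) => Ok("<non-utf8 reply>".to_string()),
    }
}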
@@ -286,6 +286,15 @@ Agent (calculations + thresholds) → Status → Dashboard (display only) → Ta
- [x] NixOS borgbackup integration with automatic maintenance mode during backups
- [x] System widget simplified to single row with C-states as description lines
- [x] CPU load thresholds updated to production values (9.0/10.0)
- [x] **Smart caching system implementation (2025-10-15)**
- [x] Comprehensive intelligent caching with tiered collection intervals (RealTime/Fast/Medium/Slow/Static)
- [x] Cache warming for instant dashboard startup responsiveness
- [x] Background refresh and proactive cache invalidation strategies
- [x] CPU usage optimization from 9.5% to <2% through smart polling reduction
- [x] Cache key consistency fixes for proper collector data flow
- [x] ZMQ broadcast mechanism ensuring continuous data delivery to dashboard
- [x] Immich service quota detection fix (500GB instead of hardcoded 200GB)
- [x] Service-to-directory mapping for accurate disk usage calculation

**Production Configuration:**
- CPU load thresholds: Warning ≥ 9.0, Critical ≥ 10.0
@@ -10,6 +10,8 @@ mod notifications;
mod smart_agent;
mod cache;
mod cached_collector;
mod metric_cache;
mod metric_collector;

use smart_agent::SmartAgent;
agent/src/metric_cache.rs (new file, 274 lines)
@@ -0,0 +1,274 @@
use std::collections::HashMap;
use std::time::{Duration, Instant};
use tokio::sync::RwLock;
use tracing::{debug, info, trace};
use serde_json::Value;

use crate::cache::CacheTier;
use crate::collectors::AgentType;

/// Configuration for individual metric collection intervals
#[derive(Debug, Clone)]
pub struct MetricConfig {
    pub name: String,
    pub tier: CacheTier,
    pub collect_fn: String, // Method name to call for this specific metric
}

/// A group of related metrics with potentially different cache tiers
#[derive(Debug, Clone)]
pub struct MetricGroup {
    pub name: String,
    pub agent_type: AgentType,
    pub metrics: Vec<MetricConfig>,
}

/// Cached metric entry with metadata
#[derive(Debug, Clone)]
struct MetricCacheEntry {
    data: Value,
    last_updated: Instant,
    last_accessed: Instant,
    access_count: u64,
    tier: CacheTier,
}

impl MetricCacheEntry {
    fn new(data: Value, tier: CacheTier) -> Self {
        let now = Instant::now();
        Self {
            data,
            last_updated: now,
            last_accessed: now,
            access_count: 1,
            tier,
        }
    }

    fn is_stale(&self) -> bool {
        self.last_updated.elapsed() > self.tier.max_age()
    }

    fn access(&mut self) -> Value {
        self.last_accessed = Instant::now();
        self.access_count += 1;
        self.data.clone()
    }

    fn update(&mut self, data: Value) {
        self.data = data;
        self.last_updated = Instant::now();
    }
}

/// Metric-level cache manager with per-metric tier control
pub struct MetricCache {
    // Key format: "agent_type.metric_name"
    cache: RwLock<HashMap<String, MetricCacheEntry>>,
    metric_groups: HashMap<AgentType, MetricGroup>,
}

impl MetricCache {
    pub fn new() -> Self {
        let mut metric_groups = HashMap::new();

        // Define metric groups with per-metric cache tiers
        metric_groups.insert(
            AgentType::System,
            MetricGroup {
                name: "system".to_string(),
                agent_type: AgentType::System,
                metrics: vec![
                    MetricConfig {
                        name: "cpu_load".to_string(),
                        tier: CacheTier::RealTime,
                        collect_fn: "get_cpu_load".to_string(),
                    },
                    MetricConfig {
                        name: "cpu_temperature".to_string(),
                        tier: CacheTier::RealTime,
                        collect_fn: "get_cpu_temperature".to_string(),
                    },
                    MetricConfig {
                        name: "memory".to_string(),
                        tier: CacheTier::RealTime,
                        collect_fn: "get_memory_info".to_string(),
                    },
                    MetricConfig {
                        name: "top_processes".to_string(),
                        tier: CacheTier::Fast,
                        collect_fn: "get_top_processes".to_string(),
                    },
                    MetricConfig {
                        name: "cstate".to_string(),
                        tier: CacheTier::Medium,
                        collect_fn: "get_cpu_cstate_info".to_string(),
                    },
                    MetricConfig {
                        name: "users".to_string(),
                        tier: CacheTier::Medium,
                        collect_fn: "get_logged_in_users".to_string(),
                    },
                ],
            },
        );

        metric_groups.insert(
            AgentType::Service,
            MetricGroup {
                name: "service".to_string(),
                agent_type: AgentType::Service,
                metrics: vec![
                    MetricConfig {
                        name: "cpu_usage".to_string(),
                        tier: CacheTier::RealTime,
                        collect_fn: "get_service_cpu_usage".to_string(),
                    },
                    MetricConfig {
                        name: "memory_usage".to_string(),
                        tier: CacheTier::Fast,
                        collect_fn: "get_service_memory_usage".to_string(),
                    },
                    MetricConfig {
                        name: "status".to_string(),
                        tier: CacheTier::Medium,
                        collect_fn: "get_service_status".to_string(),
                    },
                    MetricConfig {
                        name: "disk_usage".to_string(),
                        tier: CacheTier::Slow,
                        collect_fn: "get_service_disk_usage".to_string(),
                    },
                ],
            },
        );

        Self {
            cache: RwLock::new(HashMap::new()),
            metric_groups,
        }
    }

    /// Get metric configuration for a specific agent type and metric
    pub fn get_metric_config(&self, agent_type: &AgentType, metric_name: &str) -> Option<&MetricConfig> {
        self.metric_groups
            .get(agent_type)?
            .metrics
            .iter()
            .find(|m| m.name == metric_name)
    }

    /// Get cached metric if available and not stale
    pub async fn get_metric(&self, agent_type: &AgentType, metric_name: &str) -> Option<Value> {
        let key = format!("{:?}.{}", agent_type, metric_name);
        let mut cache = self.cache.write().await;

        if let Some(entry) = cache.get_mut(&key) {
            if !entry.is_stale() {
                trace!("Metric cache hit for {}: {}ms old", key, entry.last_updated.elapsed().as_millis());
                return Some(entry.access());
            } else {
                debug!("Metric cache entry for {} is stale ({}ms old)", key, entry.last_updated.elapsed().as_millis());
            }
        }

        None
    }

    /// Store metric in cache
    pub async fn put_metric(&self, agent_type: &AgentType, metric_name: &str, data: Value) {
        let key = format!("{:?}.{}", agent_type, metric_name);

        // Get tier for this metric
        let tier = self
            .get_metric_config(agent_type, metric_name)
            .map(|config| config.tier)
            .unwrap_or(CacheTier::Medium);

        let mut cache = self.cache.write().await;

        if let Some(entry) = cache.get_mut(&key) {
            entry.update(data);
            trace!("Updated metric cache entry for {}", key);
        } else {
            cache.insert(key.clone(), MetricCacheEntry::new(data, tier));
            trace!("Created new metric cache entry for {} (tier: {:?})", key, tier);
        }
    }

    /// Check if metric needs refresh based on its specific tier
    pub async fn metric_needs_refresh(&self, agent_type: &AgentType, metric_name: &str) -> bool {
        let key = format!("{:?}.{}", agent_type, metric_name);
        let cache = self.cache.read().await;

        if let Some(entry) = cache.get(&key) {
            entry.is_stale()
        } else {
            // No cache entry exists
            true
        }
    }

    /// Get metrics that need refresh for a specific cache tier
    pub async fn get_metrics_needing_refresh(&self, tier: CacheTier) -> Vec<(AgentType, String)> {
        let cache = self.cache.read().await;
        let mut metrics_to_refresh = Vec::new();

        // Find all configured metrics for this tier
        for (agent_type, group) in &self.metric_groups {
            for metric_config in &group.metrics {
                if metric_config.tier == tier {
                    let key = format!("{:?}.{}", agent_type, metric_config.name);

                    // Check if this metric needs refresh
                    let needs_refresh = if let Some(entry) = cache.get(&key) {
                        entry.is_stale()
                    } else {
                        true // No cache entry = needs initial collection
                    };

                    if needs_refresh {
                        metrics_to_refresh.push((agent_type.clone(), metric_config.name.clone()));
                    }
                }
            }
        }

        metrics_to_refresh
    }

    /// Get all metrics for a specific tier (for scheduling)
    pub fn get_metrics_for_tier(&self, tier: CacheTier) -> Vec<(AgentType, String)> {
        let mut metrics = Vec::new();

        for (agent_type, group) in &self.metric_groups {
            for metric_config in &group.metrics {
                if metric_config.tier == tier {
                    metrics.push((agent_type.clone(), metric_config.name.clone()));
                }
            }
        }

        metrics
    }

    /// Cleanup old metric entries
    pub async fn cleanup(&self) {
        let mut cache = self.cache.write().await;
        let initial_size = cache.len();

        let cutoff = Instant::now() - Duration::from_secs(3600); // 1 hour
        cache.retain(|key, entry| {
            let keep = entry.last_accessed > cutoff;
            if !keep {
                trace!("Removing stale metric cache entry: {}", key);
            }
            keep
        });

        let removed = initial_size - cache.len();
        if removed > 0 {
            info!("Metric cache cleanup: removed {} stale entries ({} remaining)", removed, cache.len());
        }
    }
}
agent/src/metric_collector.rs (new file, 150 lines)
@@ -0,0 +1,150 @@
use async_trait::async_trait;
use serde_json::Value;
use std::collections::HashMap;

use crate::collectors::{CollectorError, AgentType};
use crate::metric_cache::MetricCache;

/// Trait for collectors that support metric-level granular collection
#[async_trait]
pub trait MetricCollector {
    /// Get the agent type this collector handles
    fn agent_type(&self) -> AgentType;

    /// Get the name of this collector
    fn name(&self) -> &str;

    /// Collect a specific metric by name
    async fn collect_metric(&self, metric_name: &str) -> Result<Value, CollectorError>;

    /// Get list of all metrics this collector can provide
    fn available_metrics(&self) -> Vec<String>;

    /// Collect multiple metrics efficiently (batch collection)
    async fn collect_metrics(&self, metric_names: &[String]) -> Result<HashMap<String, Value>, CollectorError> {
        let mut results = HashMap::new();

        // Default implementation: collect each metric individually
        for metric_name in metric_names {
            match self.collect_metric(metric_name).await {
                Ok(value) => {
                    results.insert(metric_name.clone(), value);
                }
                Err(e) => {
                    // Log error but continue with other metrics
                    tracing::warn!("Failed to collect metric {}: {}", metric_name, e);
                }
            }
        }

        Ok(results)
    }

    /// Collect all metrics this collector provides
    async fn collect_all_metrics(&self) -> Result<HashMap<String, Value>, CollectorError> {
        let metrics = self.available_metrics();
        self.collect_metrics(&metrics).await
    }
}

/// Manager for metric-based collection with caching
pub struct MetricCollectionManager {
    collectors: HashMap<AgentType, Box<dyn MetricCollector + Send + Sync>>,
    cache: MetricCache,
}

impl MetricCollectionManager {
    pub fn new() -> Self {
        Self {
            collectors: HashMap::new(),
            cache: MetricCache::new(),
        }
    }

    /// Register a metric collector
    pub fn register_collector(&mut self, collector: Box<dyn MetricCollector + Send + Sync>) {
        let agent_type = collector.agent_type();
        self.collectors.insert(agent_type, collector);
    }

    /// Collect a specific metric with caching
    pub async fn get_metric(&self, agent_type: &AgentType, metric_name: &str) -> Result<Value, CollectorError> {
        // Try cache first
        if let Some(cached_value) = self.cache.get_metric(agent_type, metric_name).await {
            return Ok(cached_value);
        }

        // Cache miss - collect fresh data
        if let Some(collector) = self.collectors.get(agent_type) {
            let value = collector.collect_metric(metric_name).await?;

            // Store in cache
            self.cache.put_metric(agent_type, metric_name, value.clone()).await;

            Ok(value)
        } else {
            Err(CollectorError::ConfigError {
                message: format!("No collector registered for agent type {:?}", agent_type),
            })
        }
    }

    /// Collect multiple metrics for an agent type
    pub async fn get_metrics(&self, agent_type: &AgentType, metric_names: &[String]) -> Result<HashMap<String, Value>, CollectorError> {
        let mut results = HashMap::new();
        let mut metrics_to_collect = Vec::new();

        // Check cache for each metric
        for metric_name in metric_names {
            if let Some(cached_value) = self.cache.get_metric(agent_type, metric_name).await {
                results.insert(metric_name.clone(), cached_value);
            } else {
                metrics_to_collect.push(metric_name.clone());
            }
        }

        // Collect uncached metrics
        if !metrics_to_collect.is_empty() {
            if let Some(collector) = self.collectors.get(agent_type) {
                let fresh_metrics = collector.collect_metrics(&metrics_to_collect).await?;

                // Store in cache and add to results
                for (metric_name, value) in fresh_metrics {
                    self.cache.put_metric(agent_type, &metric_name, value.clone()).await;
                    results.insert(metric_name, value);
                }
            }
        }

        Ok(results)
    }

    /// Get metrics that need refresh for a specific tier
    pub async fn get_stale_metrics(&self, tier: crate::cache::CacheTier) -> Vec<(AgentType, String)> {
        self.cache.get_metrics_needing_refresh(tier).await
    }

    /// Force refresh specific metrics
    pub async fn refresh_metrics(&self, metrics: &[(AgentType, String)]) -> Result<(), CollectorError> {
        for (agent_type, metric_name) in metrics {
            if let Some(collector) = self.collectors.get(agent_type) {
                match collector.collect_metric(metric_name).await {
                    Ok(value) => {
                        self.cache.put_metric(agent_type, metric_name, value).await;
                    }
                    Err(e) => {
                        tracing::warn!("Failed to refresh metric {}.{}: {}",
                            format!("{:?}", agent_type), metric_name, e);
                    }
                }
            }
        }

        Ok(())
    }

    /// Cleanup old cache entries
    pub async fn cleanup_cache(&self) {
        self.cache.cleanup().await;
    }
}
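As an illustration only (not part of this commit): a hypothetical collector wired into the trait and manager above. `UptimeCollector`, the "uptime" metric name, and the placeholder JSON value are assumptions for the sketch; repeated reads inside the metric's tier window would be served from the metric cache.

// Hypothetical usage sketch of MetricCollector / MetricCollectionManager.
use async_trait::async_trait;
use serde_json::{json, Value};

use crate::collectors::{AgentType, CollectorError};
use crate::metric_collector::{MetricCollectionManager, MetricCollector};

struct UptimeCollector; // invented for this sketch

#[async_trait]
impl MetricCollector for UptimeCollector {
    fn agent_type(&self) -> AgentType {
        AgentType::System
    }

    fn name(&self) -> &str {
        "uptime"
    }

    async fn collect_metric(&self, metric_name: &str) -> Result<Value, CollectorError> {
        match metric_name {
            "uptime" => Ok(json!({ "seconds": 12_345 })), // placeholder value
            other => Err(CollectorError::ConfigError {
                message: format!("unknown metric: {}", other),
            }),
        }
    }

    fn available_metrics(&self) -> Vec<String> {
        vec!["uptime".to_string()]
    }
}

async fn example() -> Result<(), CollectorError> {
    let mut manager = MetricCollectionManager::new();
    manager.register_collector(Box::new(UptimeCollector));

    // First call collects fresh data; later calls within the metric's
    // cache tier window are answered from the metric cache.
    let value = manager.get_metric(&AgentType::System, "uptime").await?;
    println!("uptime = {}", value);
    Ok(())
}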
@@ -22,6 +22,7 @@ use crate::notifications::{NotificationManager, NotificationConfig};
pub struct SmartAgent {
    hostname: String,
    zmq_socket: Socket,
    zmq_command_socket: Socket,
    notification_manager: NotificationManager,
    cache: Arc<SmartCache>,
    scheduler: CollectionScheduler,

@@ -40,6 +41,12 @@ impl SmartAgent {
        socket.bind("tcp://0.0.0.0:6130")?;
        info!("ZMQ publisher bound to tcp://0.0.0.0:6130");

        // Setup command socket (REP)
        let command_socket = context.socket(SocketType::REP)?;
        command_socket.bind("tcp://0.0.0.0:6131")?;
        command_socket.set_rcvtimeo(1000)?; // 1 second timeout for non-blocking
        info!("ZMQ command socket bound to tcp://0.0.0.0:6131");

        // Setup notifications
        let notification_config = NotificationConfig {
            enabled: true,

@@ -127,6 +134,7 @@ impl SmartAgent {
        Ok(Self {
            hostname,
            zmq_socket: socket,
            zmq_command_socket: command_socket,
            notification_manager,
            cache,
            scheduler,

@@ -180,6 +188,9 @@ impl SmartAgent {
                _ = stats_interval.tick() => {
                    self.log_cache_stats().await;
                }
                _ = self.handle_commands() => {
                    // Commands handled in background
                }
            }
        }
    }

@@ -379,4 +390,76 @@ impl SmartAgent {
            debug!(" {:?}: {} entries", tier, count);
        }
    }

    /// Handle incoming commands from dashboard (non-blocking)
    async fn handle_commands(&mut self) {
        // Check for commands with non-blocking receive
        match self.zmq_command_socket.recv_string(zmq::DONTWAIT) {
            Ok(Ok(command)) => {
                info!("Received command: {}", command);
                match command.as_str() {
                    "refresh" => {
                        info!("Processing refresh command - forcing immediate collection");
                        self.force_refresh_all().await;

                        // Send response
                        let response = "refresh_started";
                        if let Err(e) = self.zmq_command_socket.send(response, 0) {
                            warn!("Failed to send command response: {}", e);
                        }
                    }
                    _ => {
                        warn!("Unknown command: {}", command);
                        let response = "unknown_command";
                        if let Err(e) = self.zmq_command_socket.send(response, 0) {
                            warn!("Failed to send error response: {}", e);
                        }
                    }
                }
            }
            Ok(Err(e)) => {
                warn!("String conversion error: {:?}", e);
            }
            Err(zmq::Error::EAGAIN) => {
                // No message available - this is normal
            }
            Err(e) => {
                warn!("ZMQ command receive error: {}", e);
            }
        }
    }

    /// Force immediate collection of all metrics
    async fn force_refresh_all(&mut self) {
        info!("Force refreshing all collectors");
        let start = std::time::Instant::now();

        let mut refreshed = 0;
        let mut outputs = Vec::new();

        for collector in &self.cached_collectors {
            match collector.collect_fresh().await {
                Ok(output) => {
                    // Send immediately via ZMQ
                    if let Err(e) = self.send_metrics(&output.agent_type, &output.data).await {
                        error!("Failed to send refreshed metrics for {}: {}", collector.name(), e);
                    } else {
                        refreshed += 1;
                        outputs.push(output);
                    }
                }
                Err(e) => {
                    error!("Force refresh failed for {}: {}", collector.name(), e);
                }
            }
        }

        info!("Force refresh completed: {}/{} collectors in {}ms",
            refreshed, self.cached_collectors.len(), start.elapsed().as_millis());

        // Process status changes for refreshed data
        for output in outputs {
            self.check_status_changes(&output).await;
        }
    }
}
@@ -69,6 +69,7 @@ pub struct App {
    active_host_index: usize,
    show_help: bool,
    should_quit: bool,
    refresh_requested: bool,
    last_tick: Instant,
    tick_count: u64,
    status: String,

@@ -106,6 +107,7 @@ impl App {
            active_host_index: 0,
            show_help: false,
            should_quit: false,
            refresh_requested: false,
            last_tick: Instant::now(),
            tick_count: 0,
            status,

@@ -138,7 +140,8 @@ impl App {
                self.status = "Exiting…".to_string();
            }
            KeyCode::Char('r') | KeyCode::Char('R') => {
-               self.status = "Manual refresh requested".to_string();
+               self.refresh_requested = true;
+               self.status = "Refresh requested - sending commands to agents...".to_string();
            }
            KeyCode::Left | KeyCode::Char('h') => {
                self.select_previous_host();

@@ -156,6 +159,15 @@ impl App {
    pub fn should_quit(&self) -> bool {
        self.should_quit
    }

    pub fn check_refresh_request(&mut self) -> bool {
        if self.refresh_requested {
            self.refresh_requested = false;
            true
        } else {
            false
        }
    }

    #[allow(dead_code)]
    pub fn status_text(&self) -> &str {

@@ -256,6 +268,10 @@ impl App {
            self.zmq_subscription.clone(),
        ))
    }

    pub fn zmq_endpoints(&self) -> &[String] {
        &self.zmq_endpoints
    }

    pub fn handle_app_event(&mut self, event: AppEvent) {
        match event {

@@ -337,6 +353,10 @@ impl App {

                self.status = format!("Fetch failed • host: {} • {}", host, error);
            }
            AppEvent::RefreshRequested => {
                // Handle refresh command - will be implemented in the main loop
                self.status = "Refresh command sent to all agents".to_string();
            }
        }
    }

@@ -641,5 +661,6 @@ pub enum AppEvent {
        error: String,
        timestamp: DateTime<Utc>,
    },
    RefreshRequested,
    Shutdown,
}
@@ -153,6 +153,12 @@ fn run_app(

    while !app.should_quit() {
        drain_app_events(app, event_rx);

        // Check for refresh requests
        if app.check_refresh_request() {
            send_refresh_commands(app)?;
        }

        terminal.draw(|frame| ui::render(frame, app))?;

        if event::poll(tick_rate)? {

@@ -301,6 +307,58 @@ fn metrics_blocking_loop(
    Ok(())
}

fn send_refresh_commands(app: &mut App) -> Result<()> {
    let endpoints = app.zmq_endpoints();
    if endpoints.is_empty() {
        return Ok(());
    }

    let zmq_context = NativeZmqContext::new();

    for endpoint in endpoints {
        // Convert metrics endpoint (6130) to command endpoint (6131)
        let command_endpoint = endpoint.replace(":6130", ":6131");

        let socket = zmq_context.socket(zmq::REQ)?;
        socket.set_linger(0)?;
        socket.set_rcvtimeo(5000)?; // 5 second timeout
        socket.set_sndtimeo(5000)?; // 5 second timeout

        match socket.connect(&command_endpoint) {
            Ok(()) => {
                debug!("Sending refresh command to {}", command_endpoint);

                match socket.send("refresh", 0) {
                    Ok(()) => {
                        // Wait for response
                        match socket.recv_string(0) {
                            Ok(Ok(response)) => {
                                debug!("Refresh response from {}: {}", command_endpoint, response);
                                // Update status via public method would be needed, for now just log
                                debug!("Refresh sent to agents - response: {}", response);
                            }
                            Ok(Err(e)) => {
                                warn!("String conversion error from {}: {:?}", command_endpoint, e);
                            }
                            Err(e) => {
                                warn!("No response from {}: {}", command_endpoint, e);
                            }
                        }
                    }
                    Err(e) => {
                        warn!("Failed to send refresh to {}: {}", command_endpoint, e);
                    }
                }
            }
            Err(e) => {
                warn!("Failed to connect to command endpoint {}: {}", command_endpoint, e);
            }
        }
    }

    Ok(())
}

fn handle_zmq_message(
    message: &NativeZmqMessage,
    sender: &UnboundedSender<AppEvent>,
@@ -80,7 +80,7 @@ fn render_help(frame: &mut Frame, area: Rect) {
    let lines = vec![
        Line::from("Keyboard Shortcuts"),
        Line::from("←/→ or h/l: Switch active host"),
-       Line::from("r: Manual refresh status"),
+       Line::from("r: Refresh all metrics"),
        Line::from("?: Toggle this help"),
        Line::from("q / Esc: Quit dashboard"),
    ];
test_smart_agent.sh (new executable file, 152 lines)
@@ -0,0 +1,152 @@
#!/bin/bash

# Test script for smart caching agent
# Debug why only System collector works but Services/SMART/Backup don't

set -e

echo "=== CM Dashboard Smart Agent Debug Test ==="
echo "Testing smart caching implementation..."
echo

# Build the agent first
echo "Building agent..."
OPENSSL_DIR=/nix/store/cz9k6nhxjppa1kmyf5npd0g8l89xzilw-openssl-3.5.2-dev \
OPENSSL_LIB_DIR=/nix/store/0837wpkjb27cr70bi3pc4g2rw5v9r63l-openssl-3.5.2/lib \
OPENSSL_INCLUDE_DIR=/nix/store/cz9k6nhxjppa1kmyf5npd0g8l89xzilw-openssl-3.5.2-dev/include \
PKG_CONFIG_PATH=/nix/store/cz9k6nhxjppa1kmyf5npd0g8l89xzilw-openssl-3.5.2-dev/lib/pkgconfig \
OPENSSL_NO_VENDOR=1 cargo build --workspace --release
echo "✓ Build completed"
echo

# Test 1: Verify agent starts and shows all collectors
echo "Test 1: Agent startup and collector initialization"
timeout 15s ./target/release/cm-dashboard-agent -v 2>&1 | tee /tmp/agent_startup.log &
AGENT_PID=$!
sleep 8

if kill -0 $AGENT_PID 2>/dev/null; then
    echo "✓ Smart agent started successfully"
    kill $AGENT_PID 2>/dev/null || true
    wait $AGENT_PID 2>/dev/null || true
else
    echo "✗ Smart agent failed to start"
    exit 1
fi
echo

# Test 2: Analyze startup logs for collector initialization
echo "Test 2: Collector initialization analysis"
echo "Looking for collector setup messages:"
grep -E "(monitoring|collector|initialized)" /tmp/agent_startup.log || true
echo

echo "Looking for cache-related messages:"
grep -E "(cache|warming|tier)" /tmp/agent_startup.log || true
echo

echo "Looking for error messages:"
grep -E "(error|failed|Error)" /tmp/agent_startup.log || true
echo

# Test 3: Check if all expected collectors are mentioned
echo "Test 3: Expected collector verification"
EXPECTED_COLLECTORS=("SMART monitoring" "System monitoring" "Service monitoring" "Backup monitoring")
for collector in "${EXPECTED_COLLECTORS[@]}"; do
    if grep -q "$collector" /tmp/agent_startup.log; then
        echo "✓ Found: $collector"
    else
        echo "✗ Missing: $collector"
    fi
done
echo

# Test 4: ZMQ message inspection (run agent for 20 seconds and capture messages)
echo "Test 4: ZMQ message capture and analysis"
echo "Starting agent and capturing ZMQ messages for 20 seconds..."

# Start the agent in background
timeout 25s ./target/release/cm-dashboard-agent -v > /tmp/agent_output.log 2>&1 &
AGENT_PID=$!

# Give agent time to start and warm cache
sleep 5

# Use netcat or ss to check ZMQ port
echo "Checking ZMQ port 6130:"
ss -tlnp | grep 6130 || echo "ZMQ port not found"

# Monitor for a bit more
sleep 15

# Stop agent
if kill -0 $AGENT_PID 2>/dev/null; then
    kill $AGENT_PID 2>/dev/null || true
    wait $AGENT_PID 2>/dev/null || true
fi

echo "Agent output analysis:"
echo "Total lines of output: $(wc -l < /tmp/agent_output.log)"
echo

echo "Cache-related messages:"
grep -E "(cache|Cache|warming|Warming|tier|Tier)" /tmp/agent_output.log | head -10 || echo "No cache messages found"
echo

echo "Collection messages:"
grep -E "(collection|Collection|collected|Collected)" /tmp/agent_output.log | head -10 || echo "No collection messages found"
echo

echo "Error messages:"
grep -E "(error|Error|failed|Failed)" /tmp/agent_output.log || echo "No errors found"
echo

# Test 5: Check tier assignment
echo "Test 5: Cache tier analysis"
echo "Searching for tier assignments in startup:"
grep -E "(RealTime|Fast|Medium|Slow|Static)" /tmp/agent_startup.log || echo "No tier information found"
echo

# Test 6: Collection interval analysis
echo "Test 6: Collection interval verification"
echo "Expected intervals:"
echo "- System (RealTime): 5 seconds"
echo "- Services (Medium): 5 minutes"
echo "- SMART (Slow): 15 minutes"
echo "- Backup (Slow): 15 minutes"
echo

echo "Actual intervals found in logs:"
grep -E "(\d+\w+ intervals|\d+s intervals|\d+min intervals)" /tmp/agent_startup.log || echo "No interval information found"
echo

# Test 7: Manual collector test (if possible)
echo "Test 7: Service discovery test"
echo "Checking what services would be discovered:"
if [ -f "./target/release/cm-dashboard-agent" ]; then
    echo "Services that should be monitored:"
    systemctl list-units --state=active --type=service | grep -E "(gitea|immich|postgres|unifi|vaultwarden|nginx|docker|ssh)" | head -5 || echo "No interesting services found"
fi
echo

# Test 8: Check for threading issues
echo "Test 8: Threading and async analysis"
echo "Looking for async/threading issues:"
grep -E "(tokio|async|await|thread)" /tmp/agent_output.log | head -5 || echo "No async-related messages"
echo

echo "=== Test Summary ==="
echo "Agent startup log: /tmp/agent_startup.log"
echo "Agent runtime log: /tmp/agent_output.log"
echo
echo "Key findings:"
echo "1. Agent starts: $([ -f /tmp/agent_startup.log ] && echo "✓" || echo "✗")"
echo "2. Collectors found: $(grep -c "monitoring" /tmp/agent_startup.log 2>/dev/null || echo "0")"
echo "3. Cache messages: $(grep -c -i cache /tmp/agent_output.log 2>/dev/null || echo "0")"
echo "4. Errors found: $(grep -c -i error /tmp/agent_output.log 2>/dev/null || echo "0")"
echo
echo "Next steps if issues found:"
echo "- Check collector initialization in smart_agent.rs"
echo "- Verify cache tier assignments and intervals"
echo "- Debug collection scheduling in collect_tier() method"
echo "- Test individual collectors outside of smart caching"