Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| e3996fdb84 | |||
| f94ca60e69 | |||
| c19ff56df8 | |||
| fe2f604703 | |||
| 8bfd416327 | |||
| 85c6c624fb | |||
| eab3f17428 | |||
| 7ad149bbe4 | |||
| b444c88ea0 |
50
CLAUDE.md
50
CLAUDE.md
@@ -156,56 +156,6 @@ Complete migration from string-based metrics to structured JSON data. Eliminates
|
||||
- ✅ Backward compatibility via bridge conversion to existing UI widgets
|
||||
- ✅ All string parsing bugs eliminated
|
||||
|
||||
### Cached Collector Architecture (🚧 PLANNED)
|
||||
|
||||
**Problem:** Blocking collectors prevent timely ZMQ transmission, causing false "host offline" alerts.
|
||||
|
||||
**Previous (Sequential Blocking):**
|
||||
```
|
||||
Every 1 second:
|
||||
└─ collect_all_data() [BLOCKS for 2-10+ seconds]
|
||||
├─ CPU (fast: 10ms)
|
||||
├─ Memory (fast: 20ms)
|
||||
├─ Disk SMART (slow: 3s per drive × 4 drives = 12s)
|
||||
├─ Service disk usage (slow: 2-8s per service)
|
||||
└─ Docker (medium: 500ms)
|
||||
└─ send_via_zmq() [Only after ALL collection completes]
|
||||
|
||||
Result: If any collector takes >10s → "host offline" false alert
|
||||
```
|
||||
|
||||
**New (Cached Independent Collectors):**
|
||||
```
|
||||
Shared Cache: Arc<RwLock<AgentData>>
|
||||
|
||||
Background Collectors (independent async tasks):
|
||||
├─ Fast collectors (CPU, RAM, Network)
|
||||
│ └─ Update cache every 1 second
|
||||
├─ Medium collectors (Services, Docker)
|
||||
│ └─ Update cache every 5 seconds
|
||||
└─ Slow collectors (Disk usage, SMART data)
|
||||
└─ Update cache every 60 seconds
|
||||
|
||||
ZMQ Sender (separate async task):
|
||||
Every 1 second:
|
||||
└─ Read current cache
|
||||
└─ Send via ZMQ [Always instant, never blocked]
|
||||
```
|
||||
|
||||
**Benefits:**
|
||||
- ✅ ZMQ sends every 1 second regardless of collector speed
|
||||
- ✅ No false "host offline" alerts from slow collectors
|
||||
- ✅ Different update rates for different metrics (CPU=1s, SMART=60s)
|
||||
- ✅ System stays responsive even with slow operations
|
||||
- ✅ Slow collectors can use longer timeouts without blocking
|
||||
|
||||
**Implementation:**
|
||||
- Shared `AgentData` cache wrapped in `Arc<RwLock<>>`
|
||||
- Each collector spawned as independent tokio task
|
||||
- Collectors update their section of cache at their own rate
|
||||
- ZMQ sender reads cache every 1s and transmits
|
||||
- Stale data acceptable for slow-changing metrics (disk usage, SMART)
|
||||
|
||||
### Maintenance Mode
|
||||
|
||||
- Agent checks for `/tmp/cm-maintenance` file before sending notifications
|
||||
|
||||
57
Cargo.lock
generated
57
Cargo.lock
generated
@@ -1,5 +1,6 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
# This file is automatically generated by Cargo.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
@@ -164,9 +165,9 @@ checksum = "df8670b8c7b9dae1793364eafadf7239c40d669904660c5960d74cfd80b46a53"
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.47"
|
||||
version = "1.2.46"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cd405d82c84ff7f35739f175f67d8b9fb7687a0e84ccdc78bd3568839827cf07"
|
||||
checksum = "b97463e1064cb1b1c1384ad0a0b9c8abd0988e2a91f52606c80ef14aadb63e36"
|
||||
dependencies = [
|
||||
"find-msvc-tools",
|
||||
"jobserver",
|
||||
@@ -238,9 +239,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.5.53"
|
||||
version = "4.5.52"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c9e340e012a1bf4935f5282ed1436d1489548e8f72308207ea5df0e23d2d03f8"
|
||||
checksum = "aa8120877db0e5c011242f96806ce3c94e0737ab8108532a76a3300a01db2ab8"
|
||||
dependencies = [
|
||||
"clap_builder",
|
||||
"clap_derive",
|
||||
@@ -248,9 +249,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "clap_builder"
|
||||
version = "4.5.53"
|
||||
version = "4.5.52"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d76b5d13eaa18c901fd2f7fca939fefe3a0727a953561fefdf3b2922b8569d00"
|
||||
checksum = "02576b399397b659c26064fbc92a75fede9d18ffd5f80ca1cd74ddab167016e1"
|
||||
dependencies = [
|
||||
"anstream",
|
||||
"anstyle",
|
||||
@@ -278,7 +279,7 @@ checksum = "a1d728cc89cf3aee9ff92b05e62b19ee65a02b5702cff7d5a377e32c6ae29d8d"
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.196"
|
||||
version = "0.1.203"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
@@ -300,7 +301,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard-agent"
|
||||
version = "0.1.196"
|
||||
version = "0.1.203"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"async-trait",
|
||||
@@ -323,7 +324,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "cm-dashboard-shared"
|
||||
version = "0.1.196"
|
||||
version = "0.1.203"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"serde",
|
||||
@@ -663,9 +664,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.16.1"
|
||||
version = "0.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100"
|
||||
checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
@@ -878,12 +879,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.12.1"
|
||||
version = "2.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0ad4bb2b565bca0645f4d68c5c9af97fba094e9791da685bf83cb5f3ce74acf2"
|
||||
checksum = "6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown 0.16.1",
|
||||
"hashbrown 0.16.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1638,9 +1639,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook-registry"
|
||||
version = "1.4.7"
|
||||
version = "1.4.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7664a098b8e616bdfcc2dc0e9ac44eb231eedf41db4e9fe95d8d32ec728dedad"
|
||||
checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
@@ -1732,9 +1733,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.111"
|
||||
version = "2.0.110"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "390cc9a294ab71bdb1aa2e99d13be9c753cd2d7bd6560c77118597410c4d2e87"
|
||||
checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -1961,9 +1962,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-attributes"
|
||||
version = "0.1.31"
|
||||
version = "0.1.30"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
|
||||
checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -1972,9 +1973,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tracing-core"
|
||||
version = "0.1.35"
|
||||
version = "0.1.34"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a04e24fab5c89c6a36eb8558c9656f30d81de51dfa4d3b45f26b21d61fa0a6c"
|
||||
checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"valuable",
|
||||
@@ -2512,9 +2513,9 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.7.14"
|
||||
version = "0.7.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829"
|
||||
checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
@@ -2566,18 +2567,18 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.8.30"
|
||||
version = "0.8.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ea879c944afe8a2b25fef16bb4ba234f47c694565e97383b36f3a878219065c"
|
||||
checksum = "0894878a5fa3edfd6da3f88c4805f4c8558e2b996227a3d864f47fe11e38282c"
|
||||
dependencies = [
|
||||
"zerocopy-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy-derive"
|
||||
version = "0.8.30"
|
||||
version = "0.8.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cf955aa904d6040f70dc8e9384444cb1030aed272ba3cb09bbc4ab9e7c1f34f5"
|
||||
checksum = "88d2b8d9c68ad2b9e4340d7832716a4d21a22a1154777ad56ea55c51a9cf3831"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard-agent"
|
||||
version = "0.1.196"
|
||||
version = "0.1.203"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::time::Duration;
|
||||
use tokio::time::interval;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
use crate::communication::{AgentCommand, ZmqHandler};
|
||||
use crate::communication::ZmqHandler;
|
||||
use crate::config::AgentConfig;
|
||||
use crate::collectors::{
|
||||
Collector,
|
||||
@@ -134,12 +134,6 @@ impl Agent {
|
||||
// NOTE: With structured data, we might need to implement status tracking differently
|
||||
// For now, we skip this until status evaluation is migrated
|
||||
}
|
||||
// Handle incoming commands (check periodically)
|
||||
_ = tokio::time::sleep(Duration::from_millis(100)) => {
|
||||
if let Err(e) = self.handle_commands().await {
|
||||
error!("Error handling commands: {}", e);
|
||||
}
|
||||
}
|
||||
_ = &mut shutdown_rx => {
|
||||
info!("Shutdown signal received, stopping agent loop");
|
||||
break;
|
||||
@@ -259,36 +253,4 @@ impl Agent {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Handle incoming commands from dashboard
|
||||
async fn handle_commands(&mut self) -> Result<()> {
|
||||
// Try to receive a command (non-blocking)
|
||||
if let Ok(Some(command)) = self.zmq_handler.try_receive_command() {
|
||||
info!("Received command: {:?}", command);
|
||||
|
||||
match command {
|
||||
AgentCommand::CollectNow => {
|
||||
info!("Received immediate collection request");
|
||||
if let Err(e) = self.collect_and_broadcast().await {
|
||||
error!("Failed to collect on demand: {}", e);
|
||||
}
|
||||
}
|
||||
AgentCommand::SetInterval { seconds } => {
|
||||
info!("Received interval change request: {}s", seconds);
|
||||
// Note: This would require more complex handling to update the interval
|
||||
// For now, just acknowledge
|
||||
}
|
||||
AgentCommand::ToggleCollector { name, enabled } => {
|
||||
info!("Received collector toggle request: {} -> {}", name, enabled);
|
||||
// Note: This would require more complex handling to enable/disable collectors
|
||||
// For now, just acknowledge
|
||||
}
|
||||
AgentCommand::Ping => {
|
||||
info!("Received ping command");
|
||||
// Maybe send back a pong or status
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
}
|
||||
@@ -5,10 +5,9 @@ use zmq::{Context, Socket, SocketType};
|
||||
|
||||
use crate::config::ZmqConfig;
|
||||
|
||||
/// ZMQ communication handler for publishing metrics and receiving commands
|
||||
/// ZMQ communication handler for publishing metrics
|
||||
pub struct ZmqHandler {
|
||||
publisher: Socket,
|
||||
command_receiver: Socket,
|
||||
}
|
||||
|
||||
impl ZmqHandler {
|
||||
@@ -26,20 +25,8 @@ impl ZmqHandler {
|
||||
publisher.set_sndhwm(1000)?; // High water mark for outbound messages
|
||||
publisher.set_linger(1000)?; // Linger time on close
|
||||
|
||||
// Create command receiver socket (PULL socket to receive commands from dashboard)
|
||||
let command_receiver = context.socket(SocketType::PULL)?;
|
||||
let cmd_bind_address = format!("tcp://{}:{}", config.bind_address, config.command_port);
|
||||
command_receiver.bind(&cmd_bind_address)?;
|
||||
|
||||
info!("ZMQ command receiver bound to {}", cmd_bind_address);
|
||||
|
||||
// Set non-blocking mode for command receiver
|
||||
command_receiver.set_rcvtimeo(0)?; // Non-blocking receive
|
||||
command_receiver.set_linger(1000)?;
|
||||
|
||||
Ok(Self {
|
||||
publisher,
|
||||
command_receiver,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -65,36 +52,4 @@ impl ZmqHandler {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Try to receive a command (non-blocking)
|
||||
pub fn try_receive_command(&self) -> Result<Option<AgentCommand>> {
|
||||
match self.command_receiver.recv_bytes(zmq::DONTWAIT) {
|
||||
Ok(bytes) => {
|
||||
debug!("Received command message ({} bytes)", bytes.len());
|
||||
|
||||
let command: AgentCommand = serde_json::from_slice(&bytes)
|
||||
.map_err(|e| anyhow::anyhow!("Failed to deserialize command: {}", e))?;
|
||||
|
||||
debug!("Parsed command: {:?}", command);
|
||||
Ok(Some(command))
|
||||
}
|
||||
Err(zmq::Error::EAGAIN) => {
|
||||
// No message available (non-blocking)
|
||||
Ok(None)
|
||||
}
|
||||
Err(e) => Err(anyhow::anyhow!("ZMQ receive error: {}", e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Commands that can be sent to the agent
|
||||
#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)]
|
||||
pub enum AgentCommand {
|
||||
/// Request immediate metric collection
|
||||
CollectNow,
|
||||
/// Change collection interval
|
||||
SetInterval { seconds: u64 },
|
||||
/// Enable/disable a collector
|
||||
ToggleCollector { name: String, enabled: bool },
|
||||
/// Request status/health check
|
||||
Ping,
|
||||
}
|
||||
|
||||
@@ -20,7 +20,6 @@ pub struct AgentConfig {
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ZmqConfig {
|
||||
pub publisher_port: u16,
|
||||
pub command_port: u16,
|
||||
pub bind_address: String,
|
||||
pub transmission_interval_seconds: u64,
|
||||
/// Heartbeat transmission interval in seconds for host connectivity detection
|
||||
|
||||
@@ -7,14 +7,6 @@ pub fn validate_config(config: &AgentConfig) -> Result<()> {
|
||||
bail!("ZMQ publisher port cannot be 0");
|
||||
}
|
||||
|
||||
if config.zmq.command_port == 0 {
|
||||
bail!("ZMQ command port cannot be 0");
|
||||
}
|
||||
|
||||
if config.zmq.publisher_port == config.zmq.command_port {
|
||||
bail!("ZMQ publisher and command ports cannot be the same");
|
||||
}
|
||||
|
||||
if config.zmq.bind_address.is_empty() {
|
||||
bail!("ZMQ bind address cannot be empty");
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard"
|
||||
version = "0.1.196"
|
||||
version = "0.1.203"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "cm-dashboard-shared"
|
||||
version = "0.1.196"
|
||||
version = "0.1.203"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
|
||||
Reference in New Issue
Block a user