All checks were successful
Build and Release / build-and-release (push) Successful in 1m32s
- Agent now extracts the "C" + digits pattern (C3, C10) using char parsing
- Removes suffixes like "_ACPI", "_MWAIT" at the source
- Reduces JSON payload size over ZMQ
- No regex dependency - uses fast char iteration (~1μs overhead)
- Robust fallback to the original name if the pattern is not found
- Dashboard simplified to use clean names directly

Bump version to v0.1.212

Co-Authored-By: Claude <noreply@anthropic.com>
226 lines
8.2 KiB
Rust
226 lines
8.2 KiB
Rust
use async_trait::async_trait;
|
|
use cm_dashboard_shared::{AgentData, Status, HysteresisThresholds};
|
|
|
|
use tracing::debug;
|
|
|
|
use super::{utils, Collector, CollectorError};
|
|
use crate::config::CpuConfig;
|
|
|
|
/// Extremely efficient CPU metrics collector
|
|
///
|
|
/// EFFICIENCY OPTIMIZATIONS:
|
|
/// - Single /proc/loadavg read for all load metrics
|
|
/// - Single /proc/stat read for CPU usage
|
|
/// - Minimal string allocations
|
|
/// - No process spawning
|
|
/// - <0.1ms collection time target
|
|
pub struct CpuCollector {
|
|
load_thresholds: HysteresisThresholds,
|
|
temperature_thresholds: HysteresisThresholds,
|
|
}
|
|
|
|
impl CpuCollector {
|
|
pub fn new(config: CpuConfig) -> Self {
|
|
// Create hysteresis thresholds with 10% gap for recovery
|
|
let load_thresholds = HysteresisThresholds::new(
|
|
config.load_warning_threshold,
|
|
config.load_critical_threshold,
|
|
);
|
|
|
|
let temperature_thresholds = HysteresisThresholds::new(
|
|
config.temperature_warning_threshold,
|
|
config.temperature_critical_threshold,
|
|
);
|
|
|
|
Self {
|
|
load_thresholds,
|
|
temperature_thresholds,
|
|
}
|
|
}
|
|
|
|
/// Calculate CPU load status using thresholds
|
|
fn calculate_load_status(&self, load: f32) -> Status {
|
|
if load >= self.load_thresholds.critical_high {
|
|
Status::Critical
|
|
} else if load >= self.load_thresholds.warning_high {
|
|
Status::Warning
|
|
} else {
|
|
Status::Ok
|
|
}
|
|
}
|
|
|
|
/// Calculate CPU temperature status using thresholds
|
|
fn calculate_temperature_status(&self, temp: f32) -> Status {
|
|
if temp >= self.temperature_thresholds.critical_high {
|
|
Status::Critical
|
|
} else if temp >= self.temperature_thresholds.warning_high {
|
|
Status::Warning
|
|
} else {
|
|
Status::Ok
|
|
}
|
|
}
|
|
|
|
/// Collect CPU load averages and populate AgentData
|
|
/// Format: "0.52 0.58 0.59 1/257 12345"
|
|
async fn collect_load_averages(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
|
let content = utils::read_proc_file("/proc/loadavg")?;
|
|
let parts: Vec<&str> = content.trim().split_whitespace().collect();
|
|
|
|
if parts.len() < 3 {
|
|
return Err(CollectorError::Parse {
|
|
value: content,
|
|
error: "Expected at least 3 values in /proc/loadavg".to_string(),
|
|
});
|
|
}
|
|
|
|
let load_1min = utils::parse_f32(parts[0])?;
|
|
let load_5min = utils::parse_f32(parts[1])?;
|
|
let load_15min = utils::parse_f32(parts[2])?;
|
|
|
|
// Populate CPU data directly
|
|
agent_data.system.cpu.load_1min = load_1min;
|
|
agent_data.system.cpu.load_5min = load_5min;
|
|
agent_data.system.cpu.load_15min = load_15min;
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Collect CPU temperature and populate AgentData
|
|
/// Prioritizes x86_pkg_temp over generic thermal zones
|
|
async fn collect_temperature(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
|
// Try x86_pkg_temp first (Intel CPU package temperature)
|
|
if let Ok(temp) = self
|
|
.read_thermal_zone("/sys/class/thermal/thermal_zone0/temp")
|
|
.await
|
|
{
|
|
let temp_celsius = temp as f32 / 1000.0;
|
|
agent_data.system.cpu.temperature_celsius = Some(temp_celsius);
|
|
return Ok(());
|
|
}
|
|
|
|
// Fallback: try other thermal zones
|
|
for zone_id in 0..10 {
|
|
let path = format!("/sys/class/thermal/thermal_zone{}/temp", zone_id);
|
|
if let Ok(temp) = self.read_thermal_zone(&path).await {
|
|
let temp_celsius = temp as f32 / 1000.0;
|
|
agent_data.system.cpu.temperature_celsius = Some(temp_celsius);
|
|
return Ok(());
|
|
}
|
|
}
|
|
|
|
debug!("No CPU temperature sensors found");
|
|
// Leave temperature as None if not available
|
|
Ok(())
|
|
}
|
|
|
|
/// Read temperature from thermal zone efficiently
|
|
async fn read_thermal_zone(&self, path: &str) -> Result<u64, CollectorError> {
|
|
let content = utils::read_proc_file(path)?;
|
|
utils::parse_u64(content.trim())
|
|
}
|
|
|
|
/// Collect CPU C-state (idle depth) and populate AgentData with top 3 C-states by usage
|
|
async fn collect_cstate(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
|
// Read C-state usage from first CPU (representative of overall system)
|
|
// C-states indicate CPU idle depth: C1=light sleep, C6=deep sleep, C10=deepest
|
|
|
|
let mut cstate_times: Vec<(String, u64)> = Vec::new();
|
|
let mut total_time: u64 = 0;
|
|
|
|
// Collect all C-state times from CPU0
|
|
for state_num in 0..=10 {
|
|
let time_path = format!("/sys/devices/system/cpu/cpu0/cpuidle/state{}/time", state_num);
|
|
let name_path = format!("/sys/devices/system/cpu/cpu0/cpuidle/state{}/name", state_num);
|
|
|
|
if let Ok(time_str) = utils::read_proc_file(&time_path) {
|
|
if let Ok(time) = utils::parse_u64(time_str.trim()) {
|
|
if let Ok(name) = utils::read_proc_file(&name_path) {
|
|
let state_name = name.trim();
|
|
// Skip POLL state (not real idle)
|
|
if state_name != "POLL" && time > 0 {
|
|
// Extract "C" + digits pattern (C3, C10, etc.) to reduce JSON size
|
|
// Handles formats like "C3_ACPI", "C10_MWAIT", etc.
|
|
let clean_name = if let Some(c_pos) = state_name.find('C') {
|
|
let rest = &state_name[c_pos + 1..];
|
|
let digit_count = rest.chars().take_while(|c| c.is_ascii_digit()).count();
|
|
if digit_count > 0 {
|
|
state_name[c_pos..c_pos + 1 + digit_count].to_string()
|
|
} else {
|
|
state_name.to_string()
|
|
}
|
|
} else {
|
|
state_name.to_string()
|
|
};
|
|
cstate_times.push((clean_name, time));
|
|
total_time += time;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// No more states available
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Sort by time descending to get top 3
|
|
cstate_times.sort_by(|a, b| b.1.cmp(&a.1));
|
|
|
|
// Calculate percentages for top 3 and populate AgentData
|
|
agent_data.system.cpu.cstates = cstate_times
|
|
.iter()
|
|
.take(3)
|
|
.map(|(name, time)| {
|
|
let percent = if total_time > 0 {
|
|
(*time as f32 / total_time as f32) * 100.0
|
|
} else {
|
|
0.0
|
|
};
|
|
cm_dashboard_shared::CStateInfo {
|
|
name: name.clone(),
|
|
percent,
|
|
}
|
|
})
|
|
.collect();
|
|
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
#[async_trait]
|
|
impl Collector for CpuCollector {
|
|
async fn collect_structured(&self, agent_data: &mut AgentData) -> Result<(), CollectorError> {
|
|
debug!("Collecting CPU metrics");
|
|
let start = std::time::Instant::now();
|
|
|
|
// Collect load averages (always available)
|
|
self.collect_load_averages(agent_data).await?;
|
|
|
|
// Collect temperature (optional)
|
|
self.collect_temperature(agent_data).await?;
|
|
|
|
// Collect C-state (CPU idle depth)
|
|
self.collect_cstate(agent_data).await?;
|
|
|
|
let duration = start.elapsed();
|
|
debug!("CPU collection completed in {:?}", duration);
|
|
|
|
// Efficiency check: warn if collection takes too long
|
|
if duration.as_millis() > 1 {
|
|
debug!(
|
|
"CPU collection took {}ms - consider optimization",
|
|
duration.as_millis()
|
|
);
|
|
}
|
|
|
|
// Calculate status using thresholds
|
|
agent_data.system.cpu.load_status = self.calculate_load_status(agent_data.system.cpu.load_1min);
|
|
agent_data.system.cpu.temperature_status = if let Some(temp) = agent_data.system.cpu.temperature_celsius {
|
|
self.calculate_temperature_status(temp)
|
|
} else {
|
|
Status::Unknown
|
|
};
|
|
|
|
Ok(())
|
|
}
|
|
}
|