Testing
This commit is contained in:
@@ -6,6 +6,7 @@ pub mod backup;
|
||||
pub mod error;
|
||||
pub mod service;
|
||||
pub mod smart;
|
||||
pub mod system;
|
||||
|
||||
pub use error::CollectorError;
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@ use async_trait::async_trait;
|
||||
use chrono::Utc;
|
||||
use serde::Serialize;
|
||||
use serde_json::json;
|
||||
use std::collections::HashMap;
|
||||
use std::process::Stdio;
|
||||
use std::time::Duration;
|
||||
use tokio::fs;
|
||||
@@ -284,33 +283,6 @@ impl ServiceCollector {
|
||||
Ok(0.0) // No limit or couldn't parse
|
||||
}
|
||||
|
||||
async fn get_system_memory_info(&self) -> Result<SystemMemoryInfo, CollectorError> {
|
||||
let meminfo =
|
||||
fs::read_to_string("/proc/meminfo")
|
||||
.await
|
||||
.map_err(|e| CollectorError::IoError {
|
||||
message: e.to_string(),
|
||||
})?;
|
||||
|
||||
let mut memory_info = HashMap::new();
|
||||
for line in meminfo.lines() {
|
||||
if let Some((key, value)) = line.split_once(':') {
|
||||
let value = value.trim().trim_end_matches(" kB");
|
||||
if let Ok(kb) = value.parse::<u64>() {
|
||||
memory_info.insert(key.to_string(), kb);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let total_kb = memory_info.get("MemTotal").copied().unwrap_or(0);
|
||||
let available_kb = memory_info.get("MemAvailable").copied().unwrap_or(0);
|
||||
let used_kb = total_kb.saturating_sub(available_kb);
|
||||
|
||||
Ok(SystemMemoryInfo {
|
||||
total_mb: total_kb as f32 / 1024.0,
|
||||
used_mb: used_kb as f32 / 1024.0,
|
||||
})
|
||||
}
|
||||
|
||||
async fn get_disk_usage(&self) -> Result<DiskUsage, CollectorError> {
|
||||
let output = Command::new("df")
|
||||
@@ -363,59 +335,9 @@ impl ServiceCollector {
|
||||
})
|
||||
}
|
||||
|
||||
async fn get_cpu_load(&self) -> Result<(f32, f32, f32), CollectorError> {
|
||||
let loadavg =
|
||||
fs::read_to_string("/proc/loadavg")
|
||||
.await
|
||||
.map_err(|e| CollectorError::IoError {
|
||||
message: e.to_string(),
|
||||
})?;
|
||||
|
||||
let parts: Vec<&str> = loadavg.split_whitespace().collect();
|
||||
if parts.len() < 3 {
|
||||
return Err(CollectorError::ParseError {
|
||||
message: "Unexpected /proc/loadavg format".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
let parse = |s: &str| -> Result<f32, CollectorError> {
|
||||
s.parse::<f32>().map_err(|e| CollectorError::ParseError {
|
||||
message: format!("Failed to parse load average '{}': {}", s, e),
|
||||
})
|
||||
};
|
||||
|
||||
Ok((parse(parts[0])?, parse(parts[1])?, parse(parts[2])?))
|
||||
}
|
||||
|
||||
fn determine_cpu_status(&self, cpu_load_5: f32) -> String {
|
||||
if cpu_load_5 >= 8.0 {
|
||||
"critical".to_string()
|
||||
} else if cpu_load_5 >= 5.0 {
|
||||
"warning".to_string()
|
||||
} else {
|
||||
"ok".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
fn determine_memory_status(&self, usage_percent: f32) -> String {
|
||||
if usage_percent >= 95.0 {
|
||||
"critical".to_string()
|
||||
} else if usage_percent >= 80.0 {
|
||||
"warning".to_string()
|
||||
} else {
|
||||
"ok".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
fn determine_cpu_temp_status(&self, temp_c: f32) -> String {
|
||||
if temp_c >= 80.0 {
|
||||
"critical".to_string()
|
||||
} else if temp_c >= 70.0 {
|
||||
"warning".to_string()
|
||||
} else {
|
||||
"ok".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
fn determine_services_status(&self, healthy: usize, degraded: usize, failed: usize) -> String {
|
||||
if failed > 0 {
|
||||
@@ -429,84 +351,6 @@ impl ServiceCollector {
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_cpu_cstate_info(&self) -> Option<Vec<String>> {
|
||||
// Read C-state information to show all sleep state distributions
|
||||
let mut cstate_times: Vec<(String, u64)> = Vec::new();
|
||||
let mut total_time = 0u64;
|
||||
|
||||
// Check if C-state information is available
|
||||
if let Ok(mut entries) = fs::read_dir("/sys/devices/system/cpu/cpu0/cpuidle").await {
|
||||
while let Ok(Some(entry)) = entries.next_entry().await {
|
||||
let state_path = entry.path();
|
||||
let name_path = state_path.join("name");
|
||||
let time_path = state_path.join("time");
|
||||
|
||||
if let (Ok(name), Ok(time_str)) = (
|
||||
fs::read_to_string(&name_path).await,
|
||||
fs::read_to_string(&time_path).await
|
||||
) {
|
||||
let name = name.trim().to_string();
|
||||
if let Ok(time) = time_str.trim().parse::<u64>() {
|
||||
total_time += time;
|
||||
cstate_times.push((name, time));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if total_time > 0 && !cstate_times.is_empty() {
|
||||
// Sort by time spent (highest first)
|
||||
cstate_times.sort_by(|a, b| b.1.cmp(&a.1));
|
||||
|
||||
// Format all C-states with percentages
|
||||
let mut result = Vec::new();
|
||||
for (name, time) in cstate_times {
|
||||
let percent = (time as f32 / total_time as f32) * 100.0;
|
||||
if percent >= 0.1 { // Only show states with at least 0.1% time
|
||||
result.push(format!("{}: {:.1}%", name, percent));
|
||||
}
|
||||
}
|
||||
|
||||
return Some(result);
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
async fn get_cpu_temperature_c(&self) -> Option<f32> {
|
||||
let mut entries = fs::read_dir("/sys/class/thermal").await.ok()?;
|
||||
let mut fallback: Option<f32> = None;
|
||||
|
||||
while let Ok(Some(entry)) = entries.next_entry().await {
|
||||
let path = entry.path();
|
||||
let type_path = path.join("type");
|
||||
let temp_path = path.join("temp");
|
||||
|
||||
let label = fs::read_to_string(&type_path).await.ok()?.to_lowercase();
|
||||
let raw = match fs::read_to_string(&temp_path).await {
|
||||
Ok(value) => value,
|
||||
Err(_) => continue,
|
||||
};
|
||||
|
||||
let milli: f32 = match raw.trim().parse() {
|
||||
Ok(value) => value,
|
||||
Err(_) => continue,
|
||||
};
|
||||
|
||||
let temp_c = milli / 1000.0;
|
||||
if label.contains("cpu") || label.contains("pkg") {
|
||||
if temp_c > 0.0 {
|
||||
return Some(temp_c);
|
||||
}
|
||||
}
|
||||
|
||||
if fallback.is_none() && temp_c > 0.0 {
|
||||
fallback = Some(temp_c);
|
||||
}
|
||||
}
|
||||
|
||||
fallback
|
||||
}
|
||||
|
||||
async fn get_gpu_metrics(&self) -> (Option<f32>, Option<f32>) {
|
||||
let output = Command::new("nvidia-smi")
|
||||
@@ -983,43 +827,21 @@ impl Collector for ServiceCollector {
|
||||
}
|
||||
}
|
||||
|
||||
// Get system memory info for quota calculation
|
||||
let system_memory = self
|
||||
.get_system_memory_info()
|
||||
.await
|
||||
.unwrap_or(SystemMemoryInfo {
|
||||
total_mb: 0.0,
|
||||
used_mb: 0.0,
|
||||
});
|
||||
|
||||
let _disk_usage = self.get_disk_usage().await.unwrap_or(DiskUsage {
|
||||
total_gb: 0.0,
|
||||
used_gb: 0.0,
|
||||
});
|
||||
|
||||
let (cpu_load_1, cpu_load_5, cpu_load_15) =
|
||||
self.get_cpu_load().await.unwrap_or((0.0, 0.0, 0.0));
|
||||
let cpu_status = self.determine_cpu_status(cpu_load_5);
|
||||
|
||||
// Calculate memory usage percentage and status
|
||||
let memory_usage_percent = if system_memory.total_mb > 0.0 {
|
||||
(system_memory.used_mb / system_memory.total_mb) * 100.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let memory_status = self.determine_memory_status(memory_usage_percent);
|
||||
|
||||
// Calculate overall services status
|
||||
let services_status = self.determine_services_status(healthy, degraded, failed);
|
||||
|
||||
let cpu_cstate_info = self.get_cpu_cstate_info().await;
|
||||
let cpu_temp_c = self.get_cpu_temperature_c().await;
|
||||
let cpu_temp_status = cpu_temp_c.map(|temp| self.determine_cpu_temp_status(temp));
|
||||
let (gpu_load_percent, gpu_temp_c) = self.get_gpu_metrics().await;
|
||||
|
||||
// If no specific quotas are set, use system memory as reference
|
||||
// If no specific quotas are set, use a default value
|
||||
if total_memory_quota == 0.0 {
|
||||
total_memory_quota = system_memory.total_mb;
|
||||
total_memory_quota = 8192.0; // Default 8GB for quota calculation
|
||||
}
|
||||
|
||||
let service_metrics = json!({
|
||||
@@ -1030,18 +852,8 @@ impl Collector for ServiceCollector {
|
||||
"services_status": services_status,
|
||||
"memory_used_mb": total_memory_used,
|
||||
"memory_quota_mb": total_memory_quota,
|
||||
"system_memory_used_mb": system_memory.used_mb,
|
||||
"system_memory_total_mb": system_memory.total_mb,
|
||||
"memory_status": memory_status,
|
||||
"disk_used_gb": total_disk_used,
|
||||
"disk_total_gb": total_disk_used, // For services, total = used (no quota concept)
|
||||
"cpu_load_1": cpu_load_1,
|
||||
"cpu_load_5": cpu_load_5,
|
||||
"cpu_load_15": cpu_load_15,
|
||||
"cpu_status": cpu_status,
|
||||
"cpu_cstate": cpu_cstate_info,
|
||||
"cpu_temp_c": cpu_temp_c,
|
||||
"cpu_temp_status": cpu_temp_status,
|
||||
"gpu_load_percent": gpu_load_percent,
|
||||
"gpu_temp_c": gpu_temp_c,
|
||||
},
|
||||
@@ -1077,10 +889,6 @@ enum ServiceStatus {
|
||||
Stopped,
|
||||
}
|
||||
|
||||
struct SystemMemoryInfo {
|
||||
total_mb: f32,
|
||||
used_mb: f32,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
struct DiskUsage {
|
||||
|
||||
271
agent/src/collectors/system.rs
Normal file
271
agent/src/collectors/system.rs
Normal file
@@ -0,0 +1,271 @@
|
||||
use async_trait::async_trait;
|
||||
use serde_json::json;
|
||||
use std::time::Duration;
|
||||
use tokio::fs;
|
||||
use tokio::process::Command;
|
||||
use tracing::debug;
|
||||
|
||||
use super::{Collector, CollectorError, CollectorOutput, AgentType};
|
||||
|
||||
pub struct SystemCollector {
|
||||
enabled: bool,
|
||||
interval: Duration,
|
||||
}
|
||||
|
||||
impl SystemCollector {
|
||||
pub fn new(enabled: bool, interval_ms: u64) -> Self {
|
||||
Self {
|
||||
enabled,
|
||||
interval: Duration::from_millis(interval_ms),
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_cpu_load(&self) -> Result<(f32, f32, f32), CollectorError> {
|
||||
let output = Command::new("uptime")
|
||||
.output()
|
||||
.await
|
||||
.map_err(|e| CollectorError::CommandFailed {
|
||||
command: "uptime".to_string(),
|
||||
message: e.to_string()
|
||||
})?;
|
||||
|
||||
let uptime_str = String::from_utf8_lossy(&output.stdout);
|
||||
|
||||
// Parse load averages from uptime output
|
||||
// Format with comma decimals: "... load average: 3,30, 3,17, 2,84"
|
||||
if let Some(load_part) = uptime_str.split("load average:").nth(1) {
|
||||
// Use regex or careful parsing for comma decimal separator locale
|
||||
let load_str = load_part.trim();
|
||||
// Split on ", " to separate the three load values
|
||||
let loads: Vec<&str> = load_str.split(", ").collect();
|
||||
if loads.len() >= 3 {
|
||||
let load_1 = loads[0].trim().replace(',', ".").parse::<f32>()
|
||||
.map_err(|_| CollectorError::ParseError { message: "Failed to parse 1min load".to_string() })?;
|
||||
let load_5 = loads[1].trim().replace(',', ".").parse::<f32>()
|
||||
.map_err(|_| CollectorError::ParseError { message: "Failed to parse 5min load".to_string() })?;
|
||||
let load_15 = loads[2].trim().replace(',', ".").parse::<f32>()
|
||||
.map_err(|_| CollectorError::ParseError { message: "Failed to parse 15min load".to_string() })?;
|
||||
|
||||
return Ok((load_1, load_5, load_15));
|
||||
}
|
||||
}
|
||||
|
||||
Err(CollectorError::ParseError { message: "Failed to parse load averages".to_string() })
|
||||
}
|
||||
|
||||
async fn get_cpu_temperature(&self) -> Option<f32> {
|
||||
// Try to find CPU-specific thermal zones first (x86_pkg_temp, coretemp, etc.)
|
||||
for i in 0..10 {
|
||||
let type_path = format!("/sys/class/thermal/thermal_zone{}/type", i);
|
||||
let temp_path = format!("/sys/class/thermal/thermal_zone{}/temp", i);
|
||||
|
||||
if let (Ok(zone_type), Ok(temp_str)) = (
|
||||
fs::read_to_string(&type_path).await,
|
||||
fs::read_to_string(&temp_path).await,
|
||||
) {
|
||||
let zone_type = zone_type.trim();
|
||||
if let Ok(temp_millic) = temp_str.trim().parse::<f32>() {
|
||||
let temp_c = temp_millic / 1000.0;
|
||||
// Look for reasonable temperatures first
|
||||
if temp_c > 20.0 && temp_c < 150.0 {
|
||||
// Prefer CPU package temperature zones
|
||||
if zone_type == "x86_pkg_temp" || zone_type.contains("coretemp") {
|
||||
debug!("Found CPU temperature: {}°C from {} ({})", temp_c, temp_path, zone_type);
|
||||
return Some(temp_c);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: try any reasonable temperature if no CPU-specific zone found
|
||||
for i in 0..10 {
|
||||
let temp_path = format!("/sys/class/thermal/thermal_zone{}/temp", i);
|
||||
if let Ok(temp_str) = fs::read_to_string(&temp_path).await {
|
||||
if let Ok(temp_millic) = temp_str.trim().parse::<f32>() {
|
||||
let temp_c = temp_millic / 1000.0;
|
||||
if temp_c > 20.0 && temp_c < 150.0 {
|
||||
debug!("Found fallback temperature: {}°C from {}", temp_c, temp_path);
|
||||
return Some(temp_c);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
async fn get_memory_info(&self) -> Result<(f32, f32), CollectorError> {
|
||||
let meminfo = fs::read_to_string("/proc/meminfo")
|
||||
.await
|
||||
.map_err(|e| CollectorError::IoError { message: format!("Failed to read /proc/meminfo: {}", e) })?;
|
||||
|
||||
let mut total_kb = 0;
|
||||
let mut available_kb = 0;
|
||||
|
||||
for line in meminfo.lines() {
|
||||
if line.starts_with("MemTotal:") {
|
||||
if let Some(value) = line.split_whitespace().nth(1) {
|
||||
total_kb = value.parse::<u64>().unwrap_or(0);
|
||||
}
|
||||
} else if line.starts_with("MemAvailable:") {
|
||||
if let Some(value) = line.split_whitespace().nth(1) {
|
||||
available_kb = value.parse::<u64>().unwrap_or(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if total_kb == 0 {
|
||||
return Err(CollectorError::ParseError { message: "Could not parse total memory".to_string() });
|
||||
}
|
||||
|
||||
let total_mb = total_kb as f32 / 1024.0;
|
||||
let used_mb = total_mb - (available_kb as f32 / 1024.0);
|
||||
|
||||
Ok((used_mb, total_mb))
|
||||
}
|
||||
|
||||
async fn get_cpu_cstate_info(&self) -> Option<Vec<String>> {
|
||||
// Read C-state information to show all sleep state distributions
|
||||
let mut cstate_times: Vec<(String, u64)> = Vec::new();
|
||||
let mut total_time = 0u64;
|
||||
|
||||
// Check if C-state information is available
|
||||
if let Ok(mut entries) = fs::read_dir("/sys/devices/system/cpu/cpu0/cpuidle").await {
|
||||
while let Ok(Some(entry)) = entries.next_entry().await {
|
||||
let state_path = entry.path();
|
||||
let name_path = state_path.join("name");
|
||||
let time_path = state_path.join("time");
|
||||
|
||||
if let (Ok(name), Ok(time_str)) = (
|
||||
fs::read_to_string(&name_path).await,
|
||||
fs::read_to_string(&time_path).await
|
||||
) {
|
||||
let name = name.trim().to_string();
|
||||
if let Ok(time) = time_str.trim().parse::<u64>() {
|
||||
total_time += time;
|
||||
cstate_times.push((name, time));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if total_time > 0 && !cstate_times.is_empty() {
|
||||
// Sort by time spent (highest first)
|
||||
cstate_times.sort_by(|a, b| b.1.cmp(&a.1));
|
||||
|
||||
// Format all C-states with percentages
|
||||
let mut result = Vec::new();
|
||||
for (name, time) in cstate_times {
|
||||
let percent = (time as f32 / total_time as f32) * 100.0;
|
||||
if percent >= 0.1 { // Only show states with at least 0.1% time
|
||||
result.push(format!("{}: {:.1}%", name, percent));
|
||||
}
|
||||
}
|
||||
|
||||
return Some(result);
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
fn determine_cpu_status(&self, cpu_load_5: f32) -> String {
|
||||
if cpu_load_5 >= 8.0 {
|
||||
"critical".to_string()
|
||||
} else if cpu_load_5 >= 5.0 {
|
||||
"warning".to_string()
|
||||
} else {
|
||||
"ok".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
fn determine_cpu_temp_status(&self, temp_c: f32) -> String {
|
||||
if temp_c >= 100.0 {
|
||||
"critical".to_string()
|
||||
} else if temp_c >= 100.0 {
|
||||
"warning".to_string()
|
||||
} else {
|
||||
"ok".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
fn determine_memory_status(&self, usage_percent: f32) -> String {
|
||||
if usage_percent >= 95.0 {
|
||||
"critical".to_string()
|
||||
} else if usage_percent >= 80.0 {
|
||||
"warning".to_string()
|
||||
} else {
|
||||
"ok".to_string()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Collector for SystemCollector {
|
||||
fn name(&self) -> &str {
|
||||
"system"
|
||||
}
|
||||
|
||||
fn agent_type(&self) -> AgentType {
|
||||
AgentType::System
|
||||
}
|
||||
|
||||
fn collect_interval(&self) -> Duration {
|
||||
self.interval
|
||||
}
|
||||
|
||||
async fn collect(&self) -> Result<CollectorOutput, CollectorError> {
|
||||
if !self.enabled {
|
||||
return Err(CollectorError::ConfigError { message: "SystemCollector disabled".to_string() });
|
||||
}
|
||||
|
||||
// Get CPU load averages
|
||||
let (cpu_load_1, cpu_load_5, cpu_load_15) = self.get_cpu_load().await?;
|
||||
let cpu_status = self.determine_cpu_status(cpu_load_5);
|
||||
|
||||
// Get CPU temperature (optional)
|
||||
let cpu_temp_c = self.get_cpu_temperature().await;
|
||||
let cpu_temp_status = cpu_temp_c.map(|temp| self.determine_cpu_temp_status(temp));
|
||||
|
||||
// Get memory information
|
||||
let (memory_used_mb, memory_total_mb) = self.get_memory_info().await?;
|
||||
let memory_usage_percent = (memory_used_mb / memory_total_mb) * 100.0;
|
||||
let memory_status = self.determine_memory_status(memory_usage_percent);
|
||||
|
||||
// Get C-state information (optional)
|
||||
let cpu_cstate_info = self.get_cpu_cstate_info().await;
|
||||
|
||||
let mut system_metrics = json!({
|
||||
"summary": {
|
||||
"cpu_load_1": cpu_load_1,
|
||||
"cpu_load_5": cpu_load_5,
|
||||
"cpu_load_15": cpu_load_15,
|
||||
"cpu_status": cpu_status,
|
||||
"memory_used_mb": memory_used_mb,
|
||||
"memory_total_mb": memory_total_mb,
|
||||
"memory_usage_percent": memory_usage_percent,
|
||||
"memory_status": memory_status,
|
||||
},
|
||||
"timestamp": chrono::Utc::now().timestamp() as u64,
|
||||
});
|
||||
|
||||
// Add optional metrics if available
|
||||
if let Some(temp) = cpu_temp_c {
|
||||
system_metrics["summary"]["cpu_temp_c"] = json!(temp);
|
||||
if let Some(status) = cpu_temp_status {
|
||||
system_metrics["summary"]["cpu_temp_status"] = json!(status);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(cstates) = cpu_cstate_info {
|
||||
system_metrics["summary"]["cpu_cstate"] = json!(cstates);
|
||||
}
|
||||
|
||||
debug!("System metrics collected: CPU load {:.2}, Memory {:.1}%",
|
||||
cpu_load_5, memory_usage_percent);
|
||||
|
||||
Ok(CollectorOutput {
|
||||
agent_type: AgentType::System,
|
||||
data: system_metrics,
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user