// cm-dashboard/agent/src/discovery.rs
//
// 445 lines
// 14 KiB
// Rust

use std::collections::HashSet;
use std::process::Stdio;
use tokio::fs;
use tokio::process::Command;
use tracing::{debug, warn};
use crate::collectors::CollectorError;
/// Field-less type whose associated functions auto-detect storage devices,
/// systemd services and backup settings on the local host.
pub struct AutoDiscovery;
impl AutoDiscovery {
/// Auto-detect storage devices suitable for SMART monitoring
pub async fn discover_storage_devices() -> Vec<String> {
let mut devices = Vec::new();
// Method 1: Try lsblk to find block devices
if let Ok(lsblk_devices) = Self::discover_via_lsblk().await {
devices.extend(lsblk_devices);
}
// Method 2: Scan /dev for common device patterns
if devices.is_empty() {
if let Ok(dev_devices) = Self::discover_via_dev_scan().await {
devices.extend(dev_devices);
}
}
// Method 3: Fallback to common device names
if devices.is_empty() {
devices = Self::fallback_device_names();
}
// Remove duplicates and sort
let mut unique_devices: Vec<String> = devices
.into_iter()
.collect::<HashSet<_>>()
.into_iter()
.collect();
unique_devices.sort();
debug!("Auto-detected storage devices: {:?}", unique_devices);
unique_devices
}
/// Lists disks by running `lsblk -d -o NAME,TYPE -n -r`.
///
/// Every output line is split on whitespace into a name and a type. A name
/// is returned when its type is `disk` and it passes `is_suitable_device`.
/// Lines with fewer than two fields are skipped.
///
/// # Errors
///
/// Returns `CollectorError::CommandFailed` when the command cannot be run or
/// exits unsuccessfully (in which case the message is its stderr).
async fn discover_via_lsblk() -> Result<Vec<String>, CollectorError> {
    let spawned = Command::new("/run/current-system/sw/bin/lsblk")
        .args(["-d", "-o", "NAME,TYPE", "-n", "-r"])
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .output()
        .await;

    let result = match spawned {
        Ok(result) => result,
        Err(err) => {
            return Err(CollectorError::CommandFailed {
                command: "lsblk".to_string(),
                message: err.to_string(),
            })
        }
    };

    if !result.status.success() {
        return Err(CollectorError::CommandFailed {
            command: "lsblk".to_string(),
            message: String::from_utf8_lossy(&result.stderr).to_string(),
        });
    }

    let listing = String::from_utf8_lossy(&result.stdout);
    let disks = listing
        .lines()
        .filter_map(|row| {
            // First column is the device name, second its type.
            let mut columns = row.split_whitespace();
            let name = columns.next()?;
            let kind = columns.next()?;
            (kind == "disk" && Self::is_suitable_device(name)).then(|| name.to_string())
        })
        .collect();

    Ok(disks)
}
async fn discover_via_dev_scan() -> Result<Vec<String>, CollectorError> {
let mut devices = Vec::new();
// Read /dev directory
let mut dev_entries = fs::read_dir("/dev")
.await
.map_err(|e| CollectorError::IoError {
message: e.to_string(),
})?;
while let Some(entry) =
dev_entries
.next_entry()
.await
.map_err(|e| CollectorError::IoError {
message: e.to_string(),
})?
{
let file_name = entry.file_name();
let device_name = file_name.to_string_lossy();
if Self::is_suitable_device(&device_name) {
devices.push(device_name.to_string());
}
}
Ok(devices)
}
/// Returns `true` when `device_name` names a whole storage device.
///
/// Accepted names:
/// - NVMe namespaces of the exact form `nvme<digits>n<digits>`, e.g.
///   `nvme0n1`. Partitions (`nvme0n1p1`), controller nodes (`nvme0`) and
///   other `nvme*` entries such as `nvme-fabrics` are rejected.
/// - Three-character names starting with `sd`, `hd` or `vd`, e.g. `sda`,
///   `hdb`, `vda`; longer names such as the partition `sda1` are rejected.
fn is_suitable_device(device_name: &str) -> bool {
    fn all_digits(text: &str) -> bool {
        !text.is_empty() && text.bytes().all(|byte| byte.is_ascii_digit())
    }

    // A plain substring test for "n" cannot work here because the "nvme"
    // prefix itself contains an 'n'; parse the name's structure instead.
    let is_nvme_namespace = device_name
        .strip_prefix("nvme")
        .and_then(|rest| rest.split_once('n'))
        .map_or(false, |(controller, namespace)| {
            all_digits(controller) && all_digits(namespace)
        });

    let is_lettered_disk = device_name.len() == 3
        && ["sd", "hd", "vd"]
            .iter()
            .any(|&prefix| device_name.starts_with(prefix));

    is_nvme_namespace || is_lettered_disk
}
/// Returns the device names assumed when no detection method found anything:
/// `nvme0n1`, `sda` and `sdb`, in that order.
fn fallback_device_names() -> Vec<String> {
    ["nvme0n1", "sda", "sdb"]
        .iter()
        .map(|&name| String::from(name))
        .collect()
}
/// Auto-detect systemd services suitable for monitoring
pub async fn discover_services() -> Vec<String> {
let mut services = Vec::new();
// Method 1: Try to find running services
if let Ok(running_services) = Self::discover_running_services().await {
services.extend(running_services);
}
// Method 2: Add host-specific services based on hostname
let hostname = gethostname::gethostname().to_string_lossy().to_string();
services.extend(Self::get_host_specific_services(&hostname));
// Normalize aliases and verify the units actually exist before deduping
let canonicalized: Vec<String> = services
.into_iter()
.filter_map(|svc| Self::canonical_service_name(&svc))
.collect();
let existing = Self::filter_existing_services(&canonicalized).await;
let mut unique_services: Vec<String> = existing
.into_iter()
.collect::<HashSet<_>>()
.into_iter()
.collect();
unique_services.sort();
debug!("Auto-detected services: {:?}", unique_services);
unique_services
}
/// Lists active service units via `systemctl list-units` and keeps the ones
/// accepted by `is_monitorable_service`.
///
/// The first whitespace-separated token of each output line is taken as the
/// unit name, and a trailing `.service` is removed before filtering.
// NOTE(review): if systemctl prefixes a line with a status marker, that
// marker would be read as the unit name — confirm whether `--plain` is needed.
///
/// # Errors
///
/// Returns `CollectorError::CommandFailed` when the command cannot be run or
/// exits unsuccessfully (in which case the message is its stderr).
async fn discover_running_services() -> Result<Vec<String>, CollectorError> {
    let spawned = Command::new("/run/current-system/sw/bin/systemctl")
        .args([
            "list-units",
            "--type=service",
            "--state=active",
            "--no-pager",
            "--no-legend",
        ])
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .output()
        .await;

    let result = match spawned {
        Ok(result) => result,
        Err(err) => {
            return Err(CollectorError::CommandFailed {
                command: "systemctl list-units".to_string(),
                message: err.to_string(),
            })
        }
    };

    if !result.status.success() {
        return Err(CollectorError::CommandFailed {
            command: "systemctl list-units".to_string(),
            message: String::from_utf8_lossy(&result.stderr).to_string(),
        });
    }

    let listing = String::from_utf8_lossy(&result.stdout);
    let names = listing
        .lines()
        // Blank lines have no first token and are dropped here.
        .filter_map(|row| row.split_whitespace().next())
        .map(|unit| unit.strip_suffix(".service").unwrap_or(unit))
        .filter(|name| Self::is_monitorable_service(name))
        .map(|name| name.to_string())
        .collect();

    Ok(names)
}
/// Decides whether a service name is one this agent should monitor.
///
/// A name is rejected when it contains any entry of the exclusion list.
/// Otherwise it is accepted when it contains one of the wanted patterns, or
/// when one of the wanted patterns contains it (so `ssh` matches through
/// `sshd`).
fn is_monitorable_service(service_name: &str) -> bool {
    // Setup/certificate helpers and specific PHP-FPM pools that are skipped.
    const EXCLUDED: &[&str] = &[
        "mosquitto-certs",
        "immich-setup",
        "phpfpm-kryddorten",
        "phpfpm-mariehall2",
    ];

    const WANTED: &[&str] = &[
        // Web applications
        "gitea",
        "immich",
        "vaultwarden",
        "unifi",
        "wordpress",
        "nginx",
        "httpd",
        // Databases
        "postgresql",
        "mysql",
        "mariadb",
        "redis",
        "mongodb",
        "mongod",
        // Backup and storage
        "borg",
        "rclone",
        // Container runtimes
        "docker",
        // CI/CD services
        "gitea-actions",
        "gitea-runner",
        "actions-runner",
        // Network services
        "sshd",
        "dnsmasq",
        // MQTT and IoT services
        "mosquitto",
        "mqtt",
        // PHP-FPM services
        "phpfpm",
        // Home automation
        "haasp",
        // Backup services
        "backup",
    ];

    let is_excluded = EXCLUDED.iter().any(|&skip| service_name.contains(skip));
    let is_wanted = WANTED
        .iter()
        .any(|&wanted| service_name.contains(wanted) || wanted.contains(service_name));

    // Exclusion takes precedence over any pattern match.
    !is_excluded && is_wanted
}
/// Returns extra services to monitor for the given host.
///
/// Always empty: the hostname is ignored and no host-specific services are
/// hardcoded.
fn get_host_specific_services(_hostname: &str) -> Vec<String> {
    Vec::new()
}
/// Maps a service name onto its canonical form.
///
/// Surrounding whitespace is trimmed; a blank name yields `None`. Known
/// aliases are matched case-insensitively: `ssh` and `sshd` become `sshd`,
/// and `docker.service` becomes `docker`. Any other name is returned trimmed
/// with its original casing.
fn canonical_service_name(service: &str) -> Option<String> {
    let name = service.trim();
    if name.is_empty() {
        return None;
    }

    let canonical = match name.to_lowercase().as_str() {
        "ssh" | "sshd" => "sshd",
        "docker.service" => "docker",
        _ => name,
    };

    Some(canonical.to_string())
}
/// Returns the entries of `services` for which `service_exists` reports
/// `true`, preserving input order (duplicates included).
///
/// Services are checked one at a time, each check awaited before the next.
async fn filter_existing_services(services: &[String]) -> Vec<String> {
    let mut kept = Vec::new();
    for name in services.iter() {
        if !Self::service_exists(name).await {
            continue;
        }
        kept.push(name.to_owned());
    }
    kept
}
/// Returns `true` when `systemctl status <unit>` exits successfully.
///
/// `.service` is appended to `service` unless it already ends with it. If the
/// command cannot be run, a warning is logged and `false` is returned.
// NOTE(review): `systemctl status` exits non-zero for units that are not
// running, so this presumably acts as an "is running" check rather than a
// pure existence check — confirm against callers' expectations.
async fn service_exists(service: &str) -> bool {
    let mut unit = service.to_string();
    if !unit.ends_with(".service") {
        unit.push_str(".service");
    }

    // Only the exit status matters, so both output streams are discarded.
    let outcome = Command::new("/run/current-system/sw/bin/systemctl")
        .args(["status", unit.as_str()])
        .stdout(Stdio::null())
        .stderr(Stdio::null())
        .output()
        .await;

    match outcome {
        Err(error) => {
            warn!("Failed to check service {}: {}", unit, error);
            false
        }
        Ok(finished) => finished.status.success(),
    }
}
/// Detects the backup monitoring configuration for this host.
///
/// Returns `(backup_enabled, restic_repo, backup_service)`:
/// - `backup_enabled` is `true` for the host `srv01`, or when
///   `has_backup_service` finds an enabled backup service;
/// - `restic_repo` is looked up only when backups are enabled, otherwise
///   `None`;
/// - `backup_service` is the first enabled backup service found, falling
///   back to `restic-backup`. It is determined even when backups are disabled.
pub async fn discover_backup_config(hostname: &str) -> (bool, Option<String>, String) {
    // The service probe is skipped entirely for the hardcoded backup host.
    let backup_enabled = if hostname == "srv01" {
        true
    } else {
        Self::has_backup_service().await
    };

    let restic_repo = match backup_enabled {
        true => Self::discover_restic_repo().await,
        false => None,
    };

    let backup_service = match Self::discover_backup_service().await {
        Some(name) => name,
        None => "restic-backup".to_string(),
    };

    (backup_enabled, restic_repo, backup_service)
}
/// Returns `true` as soon as `systemctl is-enabled` succeeds for one of
/// `restic`, `borg`, `duplicati` or `rclone` (tried in that order).
///
/// A failure to run the command counts as "not enabled" for that name.
async fn has_backup_service() -> bool {
    for candidate in ["restic", "borg", "duplicati", "rclone"] {
        let probe = Command::new("/run/current-system/sw/bin/systemctl")
            .args(["is-enabled", candidate])
            .output()
            .await;

        let enabled = probe.map_or(false, |finished| finished.status.success());
        if enabled {
            return true;
        }
    }
    false
}
/// Looks for a restic repository location.
///
/// Returns the first well-known path whose metadata can be read. If none
/// qualifies, falls back to the trimmed contents of `/etc/restic/repository`
/// when that file is readable and not blank. Returns `None` otherwise.
async fn discover_restic_repo() -> Option<String> {
    const CANDIDATE_PATHS: [&str; 5] = [
        "/srv/backups/restic",
        "/var/backups/restic",
        "/home/restic",
        "/backup/restic",
        "/mnt/backup/restic",
    ];

    for candidate in CANDIDATE_PATHS {
        if fs::metadata(candidate).await.is_err() {
            continue;
        }
        debug!("Found restic repository at: {}", candidate);
        return Some(candidate.to_string());
    }

    // No well-known path exists; consult the repository config file.
    let configured = fs::read_to_string("/etc/restic/repository").await.ok()?;
    let configured = configured.trim();
    if configured.is_empty() {
        None
    } else {
        Some(configured.to_string())
    }
}
/// Returns the first backup service name whose `<name>.service` unit makes
/// `systemctl is-enabled` succeed.
///
/// Names are tried in this order: `restic-backup`, `restic`, `borg-backup`,
/// `borg`, `backup`. The returned name has no `.service` suffix. Returns
/// `None` when none succeeds; a failure to run the command counts as
/// "not enabled".
async fn discover_backup_service() -> Option<String> {
    for name in ["restic-backup", "restic", "borg-backup", "borg", "backup"] {
        let unit = format!("{}.service", name);
        let enabled = Command::new("/run/current-system/sw/bin/systemctl")
            .args(["is-enabled", unit.as_str()])
            .output()
            .await
            .map_or(false, |finished| finished.status.success());

        if enabled {
            return Some(name.to_string());
        }
    }
    None
}
/// Filters `devices` down to those accepted by `can_access_device`.
///
/// Input order is preserved. A warning is logged for every device that is
/// dropped.
pub async fn validate_devices(devices: &[String]) -> Vec<String> {
    let mut accessible = Vec::new();
    for name in devices.iter() {
        if !Self::can_access_device(name).await {
            warn!("Cannot access device {}, skipping", name);
            continue;
        }
        accessible.push(name.to_owned());
    }
    accessible
}
/// Checks whether `/dev/<device>` can be queried with `sudo smartctl -i`.
///
/// Returns `false` when the command cannot be run, when it ends without an
/// exit code, or when its exit status reports a command-line or device-open
/// failure. Any other exit status counts as accessible.
async fn can_access_device(device: &str) -> bool {
    // smartctl's exit status is a bit mask, not a severity scale: bit 0 means
    // the command line did not parse and bit 1 means the device could not be
    // opened. Those two make the device unusable. The higher bits describe
    // failed SMART commands or disk health findings on a device that was
    // reached, so they do not disqualify it.
    const INACCESSIBLE_BITS: i32 = 0b0000_0011;

    let device_path = format!("/dev/{}", device);

    let outcome = Command::new("sudo")
        .args(["/run/current-system/sw/bin/smartctl", "-i", &device_path])
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .output()
        .await;

    match outcome {
        Ok(finished) => finished
            .status
            .code()
            .map_or(false, |code| (code & INACCESSIBLE_BITS) == 0),
        Err(_) => false,
    }
}
}