Fix nginx site monitoring to properly detect errors

- Return error status for any HTTP response that is not 2xx/3xx (e.g. 502 and other 5xx) instead of success
- Show 'error' description for sites that are reachable but respond with an error status code
- Show 'unreachable' description for complete connection failures
- Each nginx site now has independent status based on actual health
- Sites with timeouts or server errors will trigger notifications
This commit is contained in:
Christoffer Martinsson 2025-10-14 20:53:07 +02:00
parent e64527ce2f
commit 355a986582

View File

@ -832,7 +832,8 @@ impl ServiceCollector {
std::env::var("UID").unwrap_or_default() == "0"
}
async fn measure_site_latency(&self, site_name: &str) -> Option<f32> {
async fn measure_site_latency(&self, site_name: &str) -> (Option<f32>, bool) {
// Returns (latency, is_healthy)
// Construct URL from site name
let url = if site_name.contains("localhost") || site_name.contains("127.0.0.1") {
format!("http://{}", site_name)
@ -841,10 +842,13 @@ impl ServiceCollector {
};
// Create HTTP client with short timeout
let client = reqwest::Client::builder()
let client = match reqwest::Client::builder()
.timeout(Duration::from_secs(5))
.build()
.ok()?;
{
Ok(client) => client,
Err(_) => return (None, false),
};
let start = Instant::now();
@ -852,16 +856,12 @@ impl ServiceCollector {
match client.head(&url).send().await {
Ok(response) => {
let latency = start.elapsed().as_millis() as f32;
if response.status().is_success() || response.status().is_redirection() {
Some(latency)
} else {
// Site is reachable but returned error, still measure latency
Some(latency)
}
let is_healthy = response.status().is_success() || response.status().is_redirection();
(Some(latency), is_healthy)
}
Err(_) => {
// Connection failed, no latency measurement
None
// Connection failed, no latency measurement, not healthy
(None, false)
}
}
}
@ -1355,13 +1355,14 @@ impl Collector for ServiceCollector {
// Add nginx sites as individual sub-services
if let Some(sites) = self.get_nginx_sites().await {
for site in sites.iter() {
// Measure latency for this site
let latency = self.measure_site_latency(site).await;
// Measure latency and health for this site
let (latency, is_healthy) = self.measure_site_latency(site).await;
// Determine status and description based on latency measurement
let (site_status, site_description) = match latency {
Some(_ms) => (ServiceStatus::Running, None),
None => (ServiceStatus::Stopped, Some(vec!["unreachable".to_string()])),
// Determine status and description based on latency and health
let (site_status, site_description) = match (latency, is_healthy) {
(Some(_ms), true) => (ServiceStatus::Running, None),
(Some(_ms), false) => (ServiceStatus::Stopped, Some(vec!["error".to_string()])),
(None, _) => (ServiceStatus::Stopped, Some(vec!["unreachable".to_string()])),
};
// Update counters based on site status