Fix nginx site monitoring to properly detect errors

- Return error status for any non-2xx/3xx HTTP response (e.g. 502 and other 5xx) instead of success
- Show 'error' description for sites that are reachable but return an error status code
- Show 'unreachable' description for complete connection failures
- Each nginx site now has an independent status based on its actual health
- Sites with timeouts or server errors will trigger notifications
This commit is contained in:
Christoffer Martinsson 2025-10-14 20:53:07 +02:00
parent e64527ce2f
commit 355a986582

View File

@ -832,7 +832,8 @@ impl ServiceCollector {
std::env::var("UID").unwrap_or_default() == "0" std::env::var("UID").unwrap_or_default() == "0"
} }
async fn measure_site_latency(&self, site_name: &str) -> Option<f32> { async fn measure_site_latency(&self, site_name: &str) -> (Option<f32>, bool) {
// Returns (latency, is_healthy)
// Construct URL from site name // Construct URL from site name
let url = if site_name.contains("localhost") || site_name.contains("127.0.0.1") { let url = if site_name.contains("localhost") || site_name.contains("127.0.0.1") {
format!("http://{}", site_name) format!("http://{}", site_name)
@ -841,10 +842,13 @@ impl ServiceCollector {
}; };
// Create HTTP client with short timeout // Create HTTP client with short timeout
let client = reqwest::Client::builder() let client = match reqwest::Client::builder()
.timeout(Duration::from_secs(5)) .timeout(Duration::from_secs(5))
.build() .build()
.ok()?; {
Ok(client) => client,
Err(_) => return (None, false),
};
let start = Instant::now(); let start = Instant::now();
@ -852,16 +856,12 @@ impl ServiceCollector {
match client.head(&url).send().await { match client.head(&url).send().await {
Ok(response) => { Ok(response) => {
let latency = start.elapsed().as_millis() as f32; let latency = start.elapsed().as_millis() as f32;
if response.status().is_success() || response.status().is_redirection() { let is_healthy = response.status().is_success() || response.status().is_redirection();
Some(latency) (Some(latency), is_healthy)
} else {
// Site is reachable but returned error, still measure latency
Some(latency)
}
} }
Err(_) => { Err(_) => {
// Connection failed, no latency measurement // Connection failed, no latency measurement, not healthy
None (None, false)
} }
} }
} }
@ -1355,13 +1355,14 @@ impl Collector for ServiceCollector {
// Add nginx sites as individual sub-services // Add nginx sites as individual sub-services
if let Some(sites) = self.get_nginx_sites().await { if let Some(sites) = self.get_nginx_sites().await {
for site in sites.iter() { for site in sites.iter() {
// Measure latency for this site // Measure latency and health for this site
let latency = self.measure_site_latency(site).await; let (latency, is_healthy) = self.measure_site_latency(site).await;
// Determine status and description based on latency measurement // Determine status and description based on latency and health
let (site_status, site_description) = match latency { let (site_status, site_description) = match (latency, is_healthy) {
Some(_ms) => (ServiceStatus::Running, None), (Some(_ms), true) => (ServiceStatus::Running, None),
None => (ServiceStatus::Stopped, Some(vec!["unreachable".to_string()])), (Some(_ms), false) => (ServiceStatus::Stopped, Some(vec!["error".to_string()])),
(None, _) => (ServiceStatus::Stopped, Some(vec!["unreachable".to_string()])),
}; };
// Update counters based on site status // Update counters based on site status