fix network metrics & grafana dashboard

Also make the number of top process consumers configurable in the config.
2025-11-07 20:24:51 +01:00
parent 9bb3f113db
commit 2e950506b7
6 changed files with 99 additions and 39 deletions


@@ -66,12 +66,12 @@
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}, "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"targets": [ "targets": [
{ {
"expr": "system_network_rx_bytes_rate", "expr": "system_network_rx_bytes_per_sec",
"legendFormat": "RX - {{interface}}", "legendFormat": "RX - {{interface}}",
"refId": "RX" "refId": "RX"
}, },
{ {
"expr": "system_network_tx_bytes_rate", "expr": "system_network_tx_bytes_per_sec",
"legendFormat": "TX - {{interface}}", "legendFormat": "TX - {{interface}}",
"refId": "TX" "refId": "TX"
} }


@@ -9,7 +9,7 @@
 # Filter mode: "whitelist" or "blacklist"
 # - whitelist: Only export metrics for processes in the lists below
 # - blacklist: Export metrics for all processes EXCEPT those in the lists
-filter_mode = "whitelist"
+filter_mode = "blacklist"
 # Process names to monitor (case-insensitive substring match)
 # Examples for common server processes:


@@ -1,5 +1,7 @@
 use crate::config::MetricsConfig;
 use anyhow::Result;
+use std::collections::HashMap;
+use std::time::Instant;
 use sysinfo::{CpuRefreshKind, Disks, Networks, RefreshKind, System};

 /// System metrics collected at a point in time
@@ -30,8 +32,8 @@ pub struct MemoryMetric {
 #[derive(Debug, Clone)]
 pub struct NetworkMetric {
     pub interface_name: String,
-    pub rx_bytes_total: u64,
-    pub tx_bytes_total: u64,
+    pub rx_bytes_per_sec: u64,
+    pub tx_bytes_per_sec: u64,
 }

 #[derive(Debug, Clone)]
@@ -62,6 +64,9 @@ pub struct MetricsCollector {
     networks: Networks,
     disks: Disks,
     config: MetricsConfig,
+    // Network rate calculation state
+    last_network_stats: HashMap<String, (u64, u64)>, // interface -> (rx_bytes, tx_bytes)
+    last_network_time: Option<Instant>,
 }

 impl MetricsCollector {
@@ -76,6 +81,8 @@ impl MetricsCollector {
             networks: Networks::new_with_refreshed_list(),
             disks: Disks::new_with_refreshed_list(),
             config,
+            last_network_stats: HashMap::new(),
+            last_network_time: None,
         }
     }
@@ -140,15 +147,62 @@ impl MetricsCollector {
         }
     }

-    fn collect_network(&self) -> Vec<NetworkMetric> {
-        self.networks
-            .iter()
-            .map(|(interface_name, data)| NetworkMetric {
-                interface_name: interface_name.to_string(),
-                rx_bytes_total: data.total_received(),
-                tx_bytes_total: data.total_transmitted(),
-            })
-            .collect()
-    }
+    fn collect_network(&mut self) -> Vec<NetworkMetric> {
+        let now = Instant::now();
+        let mut metrics = Vec::new();
+
+        // Calculate time delta
+        let time_delta_secs = if let Some(last_time) = self.last_network_time {
+            now.duration_since(last_time).as_secs_f64()
+        } else {
+            // First collection, no rate to calculate
+            self.last_network_time = Some(now);
+            for (interface_name, data) in self.networks.iter() {
+                self.last_network_stats.insert(
+                    interface_name.to_string(),
+                    (data.total_received(), data.total_transmitted()),
+                );
+            }
+            return metrics; // Return empty on first run
+        };
+
+        // Update timestamp
+        self.last_network_time = Some(now);
+
+        // Calculate rates for each interface
+        for (interface_name, data) in self.networks.iter() {
+            let rx_total = data.total_received();
+            let tx_total = data.total_transmitted();
+
+            if let Some(&(last_rx, last_tx)) = self.last_network_stats.get(interface_name.as_str()) {
+                // Calculate bytes per second
+                let rx_bytes_per_sec = if rx_total >= last_rx {
+                    ((rx_total - last_rx) as f64 / time_delta_secs) as u64
+                } else {
+                    0 // Counter wrapped or interface reset
+                };
+                let tx_bytes_per_sec = if tx_total >= last_tx {
+                    ((tx_total - last_tx) as f64 / time_delta_secs) as u64
+                } else {
+                    0 // Counter wrapped or interface reset
+                };
+
+                metrics.push(NetworkMetric {
+                    interface_name: interface_name.to_string(),
+                    rx_bytes_per_sec,
+                    tx_bytes_per_sec,
+                });
+            }
+
+            // Update last stats
+            self.last_network_stats.insert(
+                interface_name.to_string(),
+                (rx_total, tx_total),
+            );
+        }
+
+        metrics
+    }

     fn collect_disk(&self) -> Vec<DiskMetric> {
@@ -194,13 +248,17 @@ impl MetricsCollector {
             })
             .collect();

-        // Sort by CPU usage and limit to top 10
+        // Sort by CPU usage and limit to top N (configurable)
         processes.sort_by(|a, b| {
             b.cpu_usage_percent
                 .partial_cmp(&a.cpu_usage_percent)
                 .unwrap_or(std::cmp::Ordering::Equal)
         });
-        processes.truncate(10);
+
+        let max_processes = filter
+            .map(|f| f.max_processes)
+            .unwrap_or(10);
+        processes.truncate(max_processes);

         processes
     }
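For readers skimming the diff, here is a minimal, self-contained sketch of the same rate technique (cumulative byte totals diffed against the previous sample and divided by elapsed time, with an empty result on the first pass). The RateTracker type and the numbers in main are illustrative only, not code from this repository.

use std::collections::HashMap;
use std::time::Instant;

/// Previous cumulative (rx, tx) totals per interface plus the time of the last sample.
struct RateTracker {
    last_totals: HashMap<String, (u64, u64)>,
    last_time: Option<Instant>,
}

impl RateTracker {
    fn new() -> Self {
        Self { last_totals: HashMap::new(), last_time: None }
    }

    /// Turn cumulative (rx, tx) byte totals into bytes per second.
    /// Returns an empty Vec on the first call, mirroring collect_network() above.
    fn rates(&mut self, totals: &[(&str, u64, u64)]) -> Vec<(String, u64, u64)> {
        let now = Instant::now();
        let elapsed = self.last_time.map(|t| now.duration_since(t).as_secs_f64());
        self.last_time = Some(now);

        let mut out = Vec::new();
        for &(iface, rx_total, tx_total) in totals {
            // Store the new totals and fetch the previous ones in one step.
            let prev = self.last_totals.insert(iface.to_string(), (rx_total, tx_total));
            if let (Some((last_rx, last_tx)), Some(secs)) = (prev, elapsed) {
                if secs > 0.0 {
                    // saturating_sub treats a counter wrap or interface reset as a zero delta.
                    let rx = (rx_total.saturating_sub(last_rx) as f64 / secs) as u64;
                    let tx = (tx_total.saturating_sub(last_tx) as f64 / secs) as u64;
                    out.push((iface.to_string(), rx, tx));
                }
            }
        }
        out
    }
}

fn main() {
    let mut tracker = RateTracker::new();
    // First collection only primes the state, exactly like the collector above.
    assert!(tracker.rates(&[("eth0", 1_000, 500)]).is_empty());
    std::thread::sleep(std::time::Duration::from_millis(200));
    for (iface, rx, tx) in tracker.rates(&[("eth0", 201_000, 100_500)]) {
        println!("{iface}: {rx} B/s rx, {tx} B/s tx"); // roughly 1 MB/s rx, 0.5 MB/s tx
    }
}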


@@ -195,6 +195,10 @@ pub struct ProcessFilterConfig {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub filter_mode: Option<ProcessFilterMode>,

+    /// Maximum number of processes to report (top N by CPU usage)
+    #[serde(default = "default_max_processes")]
+    pub max_processes: usize,
+
     /// List of process names to filter (case-insensitive substring match)
     #[serde(default)]
     pub names: Vec<String>,
@@ -236,6 +240,7 @@ impl ProcessFilterConfig {
         let mut merged = Self {
             include: None,
             filter_mode: included.filter_mode.or(self.filter_mode),
+            max_processes: included.max_processes,
             names: if included.names.is_empty() {
                 self.names.clone()
             } else {
@@ -345,3 +350,7 @@ fn default_timeout() -> u64 {
 fn default_true() -> bool {
     true
 }
+
+fn default_max_processes() -> usize {
+    10
+}


@@ -18,8 +18,8 @@ struct MetricInstruments {
     memory_total: opentelemetry::metrics::Gauge<u64>,
     swap_usage: opentelemetry::metrics::Gauge<u64>,
     swap_total: opentelemetry::metrics::Gauge<u64>,
-    network_rx: opentelemetry::metrics::Counter<u64>,
-    network_tx: opentelemetry::metrics::Counter<u64>,
+    network_rx: opentelemetry::metrics::Gauge<u64>,
+    network_tx: opentelemetry::metrics::Gauge<u64>,
     disk_usage: opentelemetry::metrics::Gauge<u64>,
     disk_total: opentelemetry::metrics::Gauge<u64>,
     process_cpu: opentelemetry::metrics::Gauge<f64>,
@@ -87,12 +87,12 @@ impl MetricsExporter {
.with_description("Total swap in bytes") .with_description("Total swap in bytes")
.init(), .init(),
network_rx: meter network_rx: meter
.u64_counter("system_network_rx_bytes_total") .u64_gauge("system_network_rx_bytes_per_sec")
.with_description("Total bytes received") .with_description("Bytes received per second")
.init(), .init(),
network_tx: meter network_tx: meter
.u64_counter("system_network_tx_bytes_total") .u64_gauge("system_network_tx_bytes_per_sec")
.with_description("Total bytes transmitted") .with_description("Bytes transmitted per second")
.init(), .init(),
disk_usage: meter disk_usage: meter
.u64_gauge("system_disk_usage_bytes") .u64_gauge("system_disk_usage_bytes")
@@ -145,8 +145,8 @@ impl MetricsExporter {
         if let Some(network_metrics) = &metrics.network {
             for net in network_metrics {
                 let attrs = &[KeyValue::new("interface", net.interface_name.clone())];
-                self.gauges.network_rx.add(net.rx_bytes_total, attrs);
-                self.gauges.network_tx.add(net.tx_bytes_total, attrs);
+                self.gauges.network_rx.record(net.rx_bytes_per_sec, attrs);
+                self.gauges.network_tx.record(net.tx_bytes_per_sec, attrs);
             }
         }
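The instrument switch follows from the value changing meaning: a monotonically growing byte total fits a Counter, accumulated with add(), while a precomputed bytes-per-second reading is a point-in-time value and belongs on a Gauge, written with record(). A minimal sketch of the two calls, assuming the same opentelemetry crate version as the diff above (instrument builders finished with .init()); the meter name and values are illustrative.

use opentelemetry::{global, KeyValue};

fn main() {
    let meter = global::meter("network-sketch");

    // Before this commit: cumulative totals on a Counter, accumulated with add().
    let rx_total = meter
        .u64_counter("system_network_rx_bytes_total")
        .with_description("Total bytes received")
        .init();
    rx_total.add(4_096, &[KeyValue::new("interface", "eth0")]);

    // After this commit: per-second rates on a Gauge, overwritten with record().
    let rx_rate = meter
        .u64_gauge("system_network_rx_bytes_per_sec")
        .with_description("Bytes received per second")
        .init();
    rx_rate.record(1_048_576, &[KeyValue::new("interface", "eth0")]);
}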


@@ -35,7 +35,7 @@ cpu = true # CPU usage per core
 memory = true      # RAM and swap usage
 network = true     # Network RX/TX
 disk = true        # Disk usage
-processes = false  # Top 10 processes (disabled by default - can generate high cardinality)
+processes = true   # Top 10 processes (disabled by default - can generate high cardinality)
 temperature = true # System temperatures (if available)

 # Process filtering configuration
@@ -46,23 +46,16 @@ temperature = true # System temperatures (if available)
 # Option 2: Configure inline

 # Filter mode: "whitelist" (only listed processes) or "blacklist" (exclude listed)
-filter_mode = "whitelist"
+filter_mode = "blacklist"
+
+# Maximum number of processes to report (sorted by CPU usage, default: 10)
+max_processes = 10

 # List of process names to filter (case-insensitive substring match)
 names = [
-    # Web servers
-    "nginx",
-    "apache",
-    # Databases
-    "postgres",
-    "mysql",
-    "redis",
-    # Application servers
-    # "java",
-    # "node",
-    # "python",
+    # Exclude system processes that generate too much noise
+    # "kworker",
+    # "systemd",
 ]

 # List of regex patterns to match process names (case-sensitive)