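Fix network metrics and Grafana dashboard: export per-second RX/TX byte rates instead of cumulative totals.
Make the number of top CPU-consuming processes reported configurable (max_processes).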
This commit is contained in:
@@ -66,12 +66,12 @@
|
|||||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "system_network_rx_bytes_rate",
|
"expr": "system_network_rx_bytes_per_sec",
|
||||||
"legendFormat": "RX - {{interface}}",
|
"legendFormat": "RX - {{interface}}",
|
||||||
"refId": "RX"
|
"refId": "RX"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "system_network_tx_bytes_rate",
|
"expr": "system_network_tx_bytes_per_sec",
|
||||||
"legendFormat": "TX - {{interface}}",
|
"legendFormat": "TX - {{interface}}",
|
||||||
"refId": "TX"
|
"refId": "TX"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
# Filter mode: "whitelist" or "blacklist"
|
# Filter mode: "whitelist" or "blacklist"
|
||||||
# - whitelist: Only export metrics for processes in the lists below
|
# - whitelist: Only export metrics for processes in the lists below
|
||||||
# - blacklist: Export metrics for all processes EXCEPT those in the lists
|
# - blacklist: Export metrics for all processes EXCEPT those in the lists
|
||||||
filter_mode = "whitelist"
|
filter_mode = "blacklist"
|
||||||
|
|
||||||
# Process names to monitor (case-insensitive substring match)
|
# Process names to monitor (case-insensitive substring match)
|
||||||
# Examples for common server processes:
|
# Examples for common server processes:
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
use crate::config::MetricsConfig;
|
use crate::config::MetricsConfig;
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::time::Instant;
|
||||||
use sysinfo::{CpuRefreshKind, Disks, Networks, RefreshKind, System};
|
use sysinfo::{CpuRefreshKind, Disks, Networks, RefreshKind, System};
|
||||||
|
|
||||||
/// System metrics collected at a point in time
|
/// System metrics collected at a point in time
|
||||||
@@ -30,8 +32,8 @@ pub struct MemoryMetric {
|
|||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct NetworkMetric {
|
pub struct NetworkMetric {
|
||||||
pub interface_name: String,
|
pub interface_name: String,
|
||||||
pub rx_bytes_total: u64,
|
pub rx_bytes_per_sec: u64,
|
||||||
pub tx_bytes_total: u64,
|
pub tx_bytes_per_sec: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
@@ -62,6 +64,9 @@ pub struct MetricsCollector {
|
|||||||
networks: Networks,
|
networks: Networks,
|
||||||
disks: Disks,
|
disks: Disks,
|
||||||
config: MetricsConfig,
|
config: MetricsConfig,
|
||||||
|
// Network rate calculation state
|
||||||
|
last_network_stats: HashMap<String, (u64, u64)>, // interface -> (rx_bytes, tx_bytes)
|
||||||
|
last_network_time: Option<Instant>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MetricsCollector {
|
impl MetricsCollector {
|
||||||
@@ -76,6 +81,8 @@ impl MetricsCollector {
|
|||||||
networks: Networks::new_with_refreshed_list(),
|
networks: Networks::new_with_refreshed_list(),
|
||||||
disks: Disks::new_with_refreshed_list(),
|
disks: Disks::new_with_refreshed_list(),
|
||||||
config,
|
config,
|
||||||
|
last_network_stats: HashMap::new(),
|
||||||
|
last_network_time: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -140,15 +147,62 @@ impl MetricsCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect_network(&self) -> Vec<NetworkMetric> {
|
fn collect_network(&mut self) -> Vec<NetworkMetric> {
|
||||||
self.networks
|
let now = Instant::now();
|
||||||
.iter()
|
let mut metrics = Vec::new();
|
||||||
.map(|(interface_name, data)| NetworkMetric {
|
|
||||||
interface_name: interface_name.to_string(),
|
// Calculate time delta
|
||||||
rx_bytes_total: data.total_received(),
|
let time_delta_secs = if let Some(last_time) = self.last_network_time {
|
||||||
tx_bytes_total: data.total_transmitted(),
|
now.duration_since(last_time).as_secs_f64()
|
||||||
})
|
} else {
|
||||||
.collect()
|
// First collection, no rate to calculate
|
||||||
|
self.last_network_time = Some(now);
|
||||||
|
for (interface_name, data) in self.networks.iter() {
|
||||||
|
self.last_network_stats.insert(
|
||||||
|
interface_name.to_string(),
|
||||||
|
(data.total_received(), data.total_transmitted()),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
return metrics; // Return empty on first run
|
||||||
|
};
|
||||||
|
|
||||||
|
// Update timestamp
|
||||||
|
self.last_network_time = Some(now);
|
||||||
|
|
||||||
|
// Calculate rates for each interface
|
||||||
|
for (interface_name, data) in self.networks.iter() {
|
||||||
|
let rx_total = data.total_received();
|
||||||
|
let tx_total = data.total_transmitted();
|
||||||
|
|
||||||
|
if let Some(&(last_rx, last_tx)) = self.last_network_stats.get(interface_name.as_str()) {
|
||||||
|
// Calculate bytes per second
|
||||||
|
let rx_bytes_per_sec = if rx_total >= last_rx {
|
||||||
|
((rx_total - last_rx) as f64 / time_delta_secs) as u64
|
||||||
|
} else {
|
||||||
|
0 // Counter wrapped or interface reset
|
||||||
|
};
|
||||||
|
|
||||||
|
let tx_bytes_per_sec = if tx_total >= last_tx {
|
||||||
|
((tx_total - last_tx) as f64 / time_delta_secs) as u64
|
||||||
|
} else {
|
||||||
|
0 // Counter wrapped or interface reset
|
||||||
|
};
|
||||||
|
|
||||||
|
metrics.push(NetworkMetric {
|
||||||
|
interface_name: interface_name.to_string(),
|
||||||
|
rx_bytes_per_sec,
|
||||||
|
tx_bytes_per_sec,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update last stats
|
||||||
|
self.last_network_stats.insert(
|
||||||
|
interface_name.to_string(),
|
||||||
|
(rx_total, tx_total),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
metrics
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect_disk(&self) -> Vec<DiskMetric> {
|
fn collect_disk(&self) -> Vec<DiskMetric> {
|
||||||
@@ -194,13 +248,17 @@ impl MetricsCollector {
|
|||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
// Sort by CPU usage and limit to top 10
|
// Sort by CPU usage and limit to top N (configurable)
|
||||||
processes.sort_by(|a, b| {
|
processes.sort_by(|a, b| {
|
||||||
b.cpu_usage_percent
|
b.cpu_usage_percent
|
||||||
.partial_cmp(&a.cpu_usage_percent)
|
.partial_cmp(&a.cpu_usage_percent)
|
||||||
.unwrap_or(std::cmp::Ordering::Equal)
|
.unwrap_or(std::cmp::Ordering::Equal)
|
||||||
});
|
});
|
||||||
processes.truncate(10);
|
|
||||||
|
let max_processes = filter
|
||||||
|
.map(|f| f.max_processes)
|
||||||
|
.unwrap_or(10);
|
||||||
|
processes.truncate(max_processes);
|
||||||
|
|
||||||
processes
|
processes
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -195,6 +195,10 @@ pub struct ProcessFilterConfig {
|
|||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
pub filter_mode: Option<ProcessFilterMode>,
|
pub filter_mode: Option<ProcessFilterMode>,
|
||||||
|
|
||||||
|
/// Maximum number of processes to report (top N by CPU usage)
|
||||||
|
#[serde(default = "default_max_processes")]
|
||||||
|
pub max_processes: usize,
|
||||||
|
|
||||||
/// List of process names to filter (case-insensitive substring match)
|
/// List of process names to filter (case-insensitive substring match)
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub names: Vec<String>,
|
pub names: Vec<String>,
|
||||||
@@ -236,6 +240,7 @@ impl ProcessFilterConfig {
|
|||||||
let mut merged = Self {
|
let mut merged = Self {
|
||||||
include: None,
|
include: None,
|
||||||
filter_mode: included.filter_mode.or(self.filter_mode),
|
filter_mode: included.filter_mode.or(self.filter_mode),
|
||||||
|
max_processes: included.max_processes,
|
||||||
names: if included.names.is_empty() {
|
names: if included.names.is_empty() {
|
||||||
self.names.clone()
|
self.names.clone()
|
||||||
} else {
|
} else {
|
||||||
@@ -345,3 +350,7 @@ fn default_timeout() -> u64 {
|
|||||||
fn default_true() -> bool {
|
fn default_true() -> bool {
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn default_max_processes() -> usize {
|
||||||
|
10
|
||||||
|
}
|
||||||
|
|||||||
@@ -18,8 +18,8 @@ struct MetricInstruments {
|
|||||||
memory_total: opentelemetry::metrics::Gauge<u64>,
|
memory_total: opentelemetry::metrics::Gauge<u64>,
|
||||||
swap_usage: opentelemetry::metrics::Gauge<u64>,
|
swap_usage: opentelemetry::metrics::Gauge<u64>,
|
||||||
swap_total: opentelemetry::metrics::Gauge<u64>,
|
swap_total: opentelemetry::metrics::Gauge<u64>,
|
||||||
network_rx: opentelemetry::metrics::Counter<u64>,
|
network_rx: opentelemetry::metrics::Gauge<u64>,
|
||||||
network_tx: opentelemetry::metrics::Counter<u64>,
|
network_tx: opentelemetry::metrics::Gauge<u64>,
|
||||||
disk_usage: opentelemetry::metrics::Gauge<u64>,
|
disk_usage: opentelemetry::metrics::Gauge<u64>,
|
||||||
disk_total: opentelemetry::metrics::Gauge<u64>,
|
disk_total: opentelemetry::metrics::Gauge<u64>,
|
||||||
process_cpu: opentelemetry::metrics::Gauge<f64>,
|
process_cpu: opentelemetry::metrics::Gauge<f64>,
|
||||||
@@ -87,12 +87,12 @@ impl MetricsExporter {
|
|||||||
.with_description("Total swap in bytes")
|
.with_description("Total swap in bytes")
|
||||||
.init(),
|
.init(),
|
||||||
network_rx: meter
|
network_rx: meter
|
||||||
.u64_counter("system_network_rx_bytes_total")
|
.u64_gauge("system_network_rx_bytes_per_sec")
|
||||||
.with_description("Total bytes received")
|
.with_description("Bytes received per second")
|
||||||
.init(),
|
.init(),
|
||||||
network_tx: meter
|
network_tx: meter
|
||||||
.u64_counter("system_network_tx_bytes_total")
|
.u64_gauge("system_network_tx_bytes_per_sec")
|
||||||
.with_description("Total bytes transmitted")
|
.with_description("Bytes transmitted per second")
|
||||||
.init(),
|
.init(),
|
||||||
disk_usage: meter
|
disk_usage: meter
|
||||||
.u64_gauge("system_disk_usage_bytes")
|
.u64_gauge("system_disk_usage_bytes")
|
||||||
@@ -145,8 +145,8 @@ impl MetricsExporter {
|
|||||||
if let Some(network_metrics) = &metrics.network {
|
if let Some(network_metrics) = &metrics.network {
|
||||||
for net in network_metrics {
|
for net in network_metrics {
|
||||||
let attrs = &[KeyValue::new("interface", net.interface_name.clone())];
|
let attrs = &[KeyValue::new("interface", net.interface_name.clone())];
|
||||||
self.gauges.network_rx.add(net.rx_bytes_total, attrs);
|
self.gauges.network_rx.record(net.rx_bytes_per_sec, attrs);
|
||||||
self.gauges.network_tx.add(net.tx_bytes_total, attrs);
|
self.gauges.network_tx.record(net.tx_bytes_per_sec, attrs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
23
symon.toml
23
symon.toml
@@ -35,7 +35,7 @@ cpu = true # CPU usage per core
|
|||||||
memory = true # RAM and swap usage
|
memory = true # RAM and swap usage
|
||||||
network = true # Network RX/TX
|
network = true # Network RX/TX
|
||||||
disk = true # Disk usage
|
disk = true # Disk usage
|
||||||
processes = false # Top 10 processes (disabled by default - can generate high cardinality)
|
processes = true # Top N processes by CPU usage (N = max_processes, default 10) - can generate high cardinality
|
||||||
temperature = true # System temperatures (if available)
|
temperature = true # System temperatures (if available)
|
||||||
|
|
||||||
# Process filtering configuration
|
# Process filtering configuration
|
||||||
@@ -46,23 +46,16 @@ temperature = true # System temperatures (if available)
|
|||||||
|
|
||||||
# Option 2: Configure inline
|
# Option 2: Configure inline
|
||||||
# Filter mode: "whitelist" (only listed processes) or "blacklist" (exclude listed)
|
# Filter mode: "whitelist" (only listed processes) or "blacklist" (exclude listed)
|
||||||
filter_mode = "whitelist"
|
filter_mode = "blacklist"
|
||||||
|
|
||||||
|
# Maximum number of processes to report (sorted by CPU usage, default: 10)
|
||||||
|
max_processes = 10
|
||||||
|
|
||||||
# List of process names to filter (case-insensitive substring match)
|
# List of process names to filter (case-insensitive substring match)
|
||||||
names = [
|
names = [
|
||||||
# Web servers
|
# Exclude system processes that generate too much noise
|
||||||
"nginx",
|
# "kworker",
|
||||||
"apache",
|
# "systemd",
|
||||||
|
|
||||||
# Databases
|
|
||||||
"postgres",
|
|
||||||
"mysql",
|
|
||||||
"redis",
|
|
||||||
|
|
||||||
# Application servers
|
|
||||||
# "java",
|
|
||||||
# "node",
|
|
||||||
# "python",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# List of regex patterns to match process names (case-sensitive)
|
# List of regex patterns to match process names (case-sensitive)
|
||||||
|
|||||||
Reference in New Issue
Block a user