cleaned bottom references
@@ -1,6 +1,6 @@
-# Bottom OpenTelemetry Metrics Reference
+# Symon OpenTelemetry Metrics Reference
 
-This document lists all metrics exported by Bottom when running with the `opentelemetry` feature enabled.
+This document lists all metrics exported by Symon when running with the `opentelemetry` feature enabled.
 
 ## System Metrics
 
@@ -106,7 +106,7 @@ sum(system_process_memory_usage_bytes{name=~".*chrome.*"})
 
 ## Recording Rules
 
-The following recording rules are pre-configured in Prometheus (see `rules/bottom_rules.yml`):
+The following recording rules are pre-configured in Prometheus (see `rules/Symon_rules.yml`):
 
 | Rule Name | Expression | Description |
 |-----------|------------|-------------|
@@ -166,7 +166,7 @@ system_network_rx_bytes_rate > 10485760
 
 ```yaml
 groups:
-  - name: bottom_alerts
+  - name: Symon_alerts
     interval: 30s
     rules:
      - alert: HighCPUUsage
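The body of the `HighCPUUsage` alert is elided by this hunk. A minimal sketch of what such a rule could look like; the `system_cpu_usage_percent` metric name, the 85% threshold, and the 5-minute hold time are assumptions not confirmed by this commit:

```yaml
# Hypothetical sketch - metric name, threshold, and "for" duration are assumptions.
- alert: HighCPUUsage
  expr: avg(system_cpu_usage_percent) > 85
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Average CPU usage above 85% for 5 minutes"
```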
@@ -208,8 +208,8 @@ groups:
 | `sensor` | Temperature | Temperature sensor name |
 | `name` | Process metrics | Process name |
 | `pid` | Process metrics | Process ID |
-| `exported_job` | All | Always "bottom-system-monitor" |
-| `otel_scope_name` | All | Always "bottom-system-monitor" |
+| `exported_job` | All | Always "Symon-system-monitor" |
+| `otel_scope_name` | All | Always "Symon-system-monitor" |
 
 ## Data Retention
 
@@ -1,17 +1,17 @@
-# Bottom OpenTelemetry Docker Compose Setup
+# Symon OpenTelemetry Docker Compose Setup
 
-This directory contains a Docker Compose setup for running an observability stack to monitor Bottom with OpenTelemetry.
+This directory contains a Docker Compose setup for running an observability stack to monitor Symon with OpenTelemetry.
 
 ## Architecture
 
 The stack includes:
 
-1. **OpenTelemetry Collector** - Receives metrics from Bottom via OTLP protocol
+1. **OpenTelemetry Collector** - Receives metrics from Symon via OTLP protocol
 2. **Prometheus** - Scrapes and stores metrics from the OTEL Collector
 3. **Grafana** - Visualizes metrics from Prometheus
 
 ```
-Bottom (with --headless flag)
+Symon (with --headless flag)
   ↓ (OTLP/gRPC on port 4317)
 OpenTelemetry Collector
   ↓ (Prometheus scrape on port 8889)
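For reference, a minimal sketch of an OTEL Collector configuration consistent with the ports in the diagram above; the actual `otel-collector-config.yml` is not included in this diff, and the batch processor is an assumption:

```yaml
# Hypothetical sketch - the real otel-collector-config.yml is not shown in this commit.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: 0.0.0.0:4317

processors:
  batch: {}

exporters:
  prometheus:
    endpoint: 0.0.0.0:8889

service:
  pipelines:
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [prometheus]
```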
@@ -34,7 +34,7 @@ This will start:
 - Prometheus on port 9090
 - Grafana on port 3000
 
-### 2. Build Bottom with OpenTelemetry support
+### 2. Build Symon with OpenTelemetry support
 
 ```bash
 cd ..
@@ -43,13 +43,13 @@ cargo build --release --features opentelemetry
 
 ### 3. Create a configuration file
 
-Create a `bottom-config.toml` file:
+Create a `Symon-config.toml` file:
 
 ```toml
 [opentelemetry]
 enabled = true
 endpoint = "http://localhost:4317"
-service_name = "bottom-system-monitor"
+service_name = "Symon-system-monitor"
 export_interval_ms = 5000
 
 [opentelemetry.metrics]
@@ -62,10 +62,10 @@ temperature = true
 gpu = true
 ```
 
-### 4. Run Bottom in headless mode
+### 4. Run Symon in headless mode
 
 ```bash
-./target/release/btm --config bottom-config.toml --headless
+./target/release/btm --config Symon-config.toml --headless
 ```
 
 Or without config file:
@@ -93,11 +93,11 @@ Configures the OpenTelemetry Collector to:
 
 Configures Prometheus to:
 - Scrape metrics from the OTEL Collector every 10 seconds
-- Load alerting rules from `rules/bottom_rules.yml`
+- Load alerting rules from `rules/Symon_rules.yml`
 
-### rules/bottom_rules.yml
+### rules/Symon_rules.yml
 
-Contains Prometheus recording rules for Bottom metrics, including:
+Contains Prometheus recording rules for Symon metrics, including:
 - Recent process CPU usage metrics
 - Recent process memory usage metrics
 
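The `prometheus.yml` itself is not shown in this hunk; a minimal sketch matching the behaviour described above. The job name is hypothetical, and the rules path follows the mount used in the compose file:

```yaml
# Hypothetical sketch - the actual prometheus.yml is not part of this hunk.
global:
  scrape_interval: 10s

rule_files:
  - /etc/prometheus/rules/Symon_rules.yml

scrape_configs:
  - job_name: otel-collector    # hypothetical job name
    static_configs:
      - targets: ["otel-collector:8889"]
```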
@@ -132,11 +132,11 @@ topk(10, system_process_memory_usage_bytes)
 
 Grafana is automatically configured with:
 - **Prometheus data source** (http://prometheus:9090) - pre-configured
-- **Bottom System Overview dashboard** - pre-loaded
+- **Symon System Overview dashboard** - pre-loaded
 
 To access:
 1. Go to http://localhost:3000 (username: `admin`, password: `admin`)
-2. Navigate to Dashboards → Browse → "Bottom System Overview"
+2. Navigate to Dashboards → Browse → "Symon System Overview"
 
 The dashboard includes:
 - CPU usage by core
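The pre-configured Prometheus data source is typically provisioned from a YAML file; a minimal sketch consistent with the URL above (the datasource provisioning file itself is not part of this diff):

```yaml
# Hypothetical sketch - the datasource provisioning file is not shown in this commit.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
```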
@@ -160,7 +160,7 @@ docker-compose down -v
 
 ## Troubleshooting
 
-### Bottom not sending metrics
+### Symon not sending metrics
 
 Check the OTEL Collector logs:
 ```bash
@@ -178,7 +178,7 @@ You should see messages about receiving metrics.
 
 1. Verify Prometheus data source is configured correctly
 2. Check that Prometheus has data by querying directly
-3. Ensure your time range in Grafana includes when Bottom was running
+3. Ensure your time range in Grafana includes when Symon was running
 
 ## Advanced Configuration
 
@@ -188,7 +188,7 @@ A TimescaleDB configuration file is available as `docker-compose-timescale.yml.k
 
 ### Custom Prometheus Rules
 
-Edit `rules/bottom_rules.yml` to add custom recording or alerting rules.
+Edit `rules/Symon_rules.yml` to add custom recording or alerting rules.
 
 ### OTEL Collector Sampling
 
@@ -1,61 +0,0 @@
-services:
-  timescaledb:
-    image: timescale/timescaledb-ha:pg15
-    environment:
-      POSTGRES_PASSWORD: password
-      POSTGRES_DB: promscale
-      POSTGRES_USER: postgres
-    ports:
-      - "5432:5432"
-    volumes:
-      - timescale_data:/var/lib/postgresql/data
-
-  promscale:
-    image: timescale/promscale:latest
-    ports:
-      - "9201:9201"
-    depends_on:
-      - timescaledb
-    environment:
-      PROMSCALE_DB_URI: postgres://postgres:password@timescaledb:5432/promscale?sslmode=disable
-      PROMSCALE_STARTUP_INSTALL_EXTENSIONS: "true"
-    restart: on-failure
-
-  otel-collector:
-    image: otel/opentelemetry-collector-contrib:latest
-    container_name: otel-collector
-    command: ["--config=/etc/otel-collector-config.yml"]
-    volumes:
-      - ./otel-collector-config.yml:/etc/otel-collector-config.yml
-
-    ports:
-      - "4317:4317"
-
-  prometheus:
-    image: prom/prometheus:latest
-    container_name: prometheus
-    volumes:
-      - ./prometheus.yml:/etc/prometheus/prometheus.yml
-      - ./rules:/etc/prometheus/rules
-    ports:
- "9090:9090" # Interfaccia Web di Prometheus
|
||||
-    command:
-      - '--config.file=/etc/prometheus/prometheus.yml'
-    depends_on:
-      - otel-collector
-
-  grafana:
-    image: grafana/grafana:latest
-    ports:
-      - "3000:3000"
-    environment:
-      - GF_SECURITY_ADMIN_PASSWORD=admin
-      - GF_SECURITY_ADMIN_USER=admin
-    volumes:
-      - grafana-storage:/var/lib/grafana
-    depends_on:
-      - prometheus
-
-volumes:
-  grafana-storage:
-  timescale_data:
@@ -1,6 +1,6 @@
 {
-  "title": "Bottom System Overview",
-  "uid": "bottom-overview",
+  "title": "Symon System Overview",
+  "uid": "symon-overview",
   "timezone": "browser",
   "schemaVersion": 16,
   "refresh": "5s",
@@ -1,7 +1,7 @@
 apiVersion: 1
 
 providers:
-  - name: 'Bottom Dashboards'
+  - name: 'Symon Dashboards'
    orgId: 1
    folder: ''
    type: file
@@ -1,5 +1,5 @@
 groups:
-  - name: bottom_process_metrics
+  - name: symon_process_metrics
    interval: 30s
    rules:
      - record: system_process_cpu_usage_percent:recent
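The rule expression is elided by this hunk; a minimal sketch of what the `:recent` recording rule could look like. Only the rule name above comes from the diff; the `last_over_time` expression and 5-minute window are assumptions:

```yaml
# Hypothetical sketch - the expression and window are assumptions.
- record: system_process_cpu_usage_percent:recent
  expr: last_over_time(system_process_cpu_usage_percent[5m])
```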
@@ -3,7 +3,7 @@
 
 set -e
 
-echo "🔍 Testing Bottom OpenTelemetry Stack..."
+echo "🔍 Testing Symon OpenTelemetry Stack..."
 echo ""
 
 # Colors
@@ -75,6 +75,6 @@ echo " - Grafana: http://localhost:3000 (admin/admin)"
 echo " - OTEL Collector metrics: http://localhost:8889/metrics"
 echo ""
 echo "💡 Next steps:"
-echo " 1. Build bottom with: cargo build --release --features opentelemetry"
+echo " 1. Build Symon with: cargo build --release --features opentelemetry"
 echo " 2. Run in headless mode: ./target/release/btm --headless"
 echo " 3. Check metrics in Prometheus: http://localhost:9090/graph"
@@ -1,8 +1,8 @@
 # Example process filter configuration file
-# This file can be included from the main bottom config to keep
+# This file can be included from the main symon config to keep
 # server-specific process lists separate.
 #
-# Usage in bottom-config.toml:
+# Usage in symon-config.toml:
 # [opentelemetry.metrics.process_filter]
 # include = "processes.toml"
237 src/collector.rs
@@ -2,7 +2,7 @@ use crate::config::MetricsConfig;
 use anyhow::Result;
 use std::collections::HashMap;
 use std::time::Instant;
-use sysinfo::{CpuRefreshKind, Disks, Networks, RefreshKind, System};
+use sysinfo::{Disks, Networks, ProcessesToUpdate, RefreshKind, System};
 
 /// System metrics collected at a point in time
 #[derive(Debug, Clone)]
@@ -13,6 +13,8 @@ pub struct SystemMetrics {
     pub disk: Option<Vec<DiskMetric>>,
     pub processes: Option<Vec<ProcessMetric>>,
     pub temperature: Option<Vec<TemperatureMetric>>,
+    pub load_avg: Option<LoadAvgMetric>,
+    pub disk_io: Option<Vec<DiskIoMetric>>,
 }
 
 #[derive(Debug, Clone)]
@@ -34,6 +36,10 @@ pub struct NetworkMetric {
     pub interface_name: String,
     pub rx_bytes_per_sec: u64,
     pub tx_bytes_per_sec: u64,
+    pub rx_packets_per_sec: u64,
+    pub tx_packets_per_sec: u64,
+    pub rx_errors_per_sec: u64,
+    pub tx_errors_per_sec: u64,
 }
 
 #[derive(Debug, Clone)]
@@ -58,6 +64,22 @@ pub struct TemperatureMetric {
     pub temperature_celsius: f32,
 }
 
+#[derive(Debug, Clone)]
+pub struct LoadAvgMetric {
+    pub load1: f64,
+    pub load5: f64,
+    pub load15: f64,
+}
+
+#[derive(Debug, Clone)]
+pub struct DiskIoMetric {
+    pub device_name: String,
+    pub read_bytes_per_sec: u64,
+    pub write_bytes_per_sec: u64,
+    pub read_ops_per_sec: u64,
+    pub write_ops_per_sec: u64,
+}
+
 /// Collector for system metrics
 pub struct MetricsCollector {
     system: System,
@@ -65,16 +87,35 @@ pub struct MetricsCollector {
     disks: Disks,
     config: MetricsConfig,
     // Network rate calculation state
-    last_network_stats: HashMap<String, (u64, u64)>, // interface -> (rx_bytes, tx_bytes)
+    last_network_stats: HashMap<String, NetworkStats>,
     last_network_time: Option<Instant>,
+    // Disk I/O rate calculation state
+    last_disk_io_stats: HashMap<String, DiskIoStats>,
+    last_disk_io_time: Option<Instant>,
 }
 
+#[derive(Debug, Clone)]
+struct NetworkStats {
+    rx_bytes: u64,
+    tx_bytes: u64,
+    rx_packets: u64,
+    tx_packets: u64,
+    rx_errors: u64,
+    tx_errors: u64,
+}
+
+#[derive(Debug, Clone)]
+struct DiskIoStats {
+    read_bytes: u64,
+    write_bytes: u64,
+    read_count: u64,
+    write_count: u64,
+}
+
 impl MetricsCollector {
     pub fn new(config: MetricsConfig) -> Self {
-        let refresh_kind = RefreshKind::new()
-            .with_cpu(CpuRefreshKind::everything())
-            .with_memory(sysinfo::MemoryRefreshKind::everything())
-            .with_processes(sysinfo::ProcessRefreshKind::everything());
+        // Initialize with minimal data - we'll refresh on-demand
+        let refresh_kind = RefreshKind::new();
 
         Self {
             system: System::new_with_specifics(refresh_kind),
@@ -83,14 +124,35 @@ impl MetricsCollector {
             config,
             last_network_stats: HashMap::new(),
             last_network_time: None,
+            last_disk_io_stats: HashMap::new(),
+            last_disk_io_time: None,
         }
     }
 
     /// Collect all enabled metrics
     pub fn collect(&mut self) -> Result<SystemMetrics> {
-        // Refresh system info
-        self.system.refresh_all();
-        self.networks.refresh();
+        // Refresh only what's needed based on enabled metrics
+        if self.config.cpu {
+            self.system.refresh_cpu_all();
+        }
+
+        if self.config.memory {
+            self.system.refresh_memory();
+        }
+
+        if self.config.processes {
+            self.system.refresh_processes(ProcessesToUpdate::All);
+        }
+
+        if self.config.network {
+            self.networks.refresh();
+        }
+
+        if self.config.disk {
+            self.disks.refresh();
+        }
+
+        // Note: Temperature metrics are currently not implemented
 
         Ok(SystemMetrics {
             cpu: if self.config.cpu {
@@ -123,6 +185,16 @@
             } else {
                 None
             },
+            load_avg: if self.config.load_avg {
+                Some(self.collect_load_avg())
+            } else {
+                None
+            },
+            disk_io: if self.config.disk_io {
+                Some(self.collect_disk_io())
+            } else {
+                None
+            },
         })
     }
 
@@ -160,7 +232,14 @@
             for (interface_name, data) in self.networks.iter() {
                 self.last_network_stats.insert(
                     interface_name.to_string(),
-                    (data.total_received(), data.total_transmitted()),
+                    NetworkStats {
+                        rx_bytes: data.total_received(),
+                        tx_bytes: data.total_transmitted(),
+                        rx_packets: data.total_packets_received(),
+                        tx_packets: data.total_packets_transmitted(),
+                        rx_errors: data.total_errors_on_received(),
+                        tx_errors: data.total_errors_on_transmitted(),
+                    },
                 );
             }
             return metrics; // Return empty on first run
@@ -171,40 +250,43 @@
 
         // Calculate rates for each interface
         for (interface_name, data) in self.networks.iter() {
-            let rx_total = data.total_received();
-            let tx_total = data.total_transmitted();
-
-            if let Some(&(last_rx, last_tx)) = self.last_network_stats.get(interface_name.as_str()) {
-                // Calculate bytes per second
-                let rx_bytes_per_sec = if rx_total >= last_rx {
-                    ((rx_total - last_rx) as f64 / time_delta_secs) as u64
-                } else {
-                    0 // Counter wrapped or interface reset
-                };
-
-                let tx_bytes_per_sec = if tx_total >= last_tx {
-                    ((tx_total - last_tx) as f64 / time_delta_secs) as u64
-                } else {
-                    0 // Counter wrapped or interface reset
-                };
+            let current_stats = NetworkStats {
+                rx_bytes: data.total_received(),
+                tx_bytes: data.total_transmitted(),
+                rx_packets: data.total_packets_received(),
+                tx_packets: data.total_packets_transmitted(),
+                rx_errors: data.total_errors_on_received(),
+                tx_errors: data.total_errors_on_transmitted(),
+            };
+
+            if let Some(last_stats) = self.last_network_stats.get(interface_name.as_str()) {
+                // Calculate rates per second
                 metrics.push(NetworkMetric {
                     interface_name: interface_name.to_string(),
-                    rx_bytes_per_sec,
-                    tx_bytes_per_sec,
+                    rx_bytes_per_sec: Self::calculate_rate(current_stats.rx_bytes, last_stats.rx_bytes, time_delta_secs),
+                    tx_bytes_per_sec: Self::calculate_rate(current_stats.tx_bytes, last_stats.tx_bytes, time_delta_secs),
+                    rx_packets_per_sec: Self::calculate_rate(current_stats.rx_packets, last_stats.rx_packets, time_delta_secs),
+                    tx_packets_per_sec: Self::calculate_rate(current_stats.tx_packets, last_stats.tx_packets, time_delta_secs),
+                    rx_errors_per_sec: Self::calculate_rate(current_stats.rx_errors, last_stats.rx_errors, time_delta_secs),
+                    tx_errors_per_sec: Self::calculate_rate(current_stats.tx_errors, last_stats.tx_errors, time_delta_secs),
                 });
             }
 
             // Update last stats
-            self.last_network_stats.insert(
-                interface_name.to_string(),
-                (rx_total, tx_total),
-            );
+            self.last_network_stats.insert(interface_name.to_string(), current_stats);
         }
 
         metrics
     }
 
+    fn calculate_rate(current: u64, last: u64, time_delta: f64) -> u64 {
+        if current >= last {
+            ((current - last) as f64 / time_delta) as u64
+        } else {
+            0 // Counter wrapped or interface reset
+        }
+    }
+
     fn collect_disk(&self) -> Vec<DiskMetric> {
         self.disks
             .iter()
@@ -225,41 +307,63 @@
 
     fn collect_processes(&self) -> Vec<ProcessMetric> {
         let filter = self.config.process_filter.as_ref();
+        let max_processes = filter.map(|f| f.max_processes).unwrap_or(10);
 
-        let mut processes: Vec<ProcessMetric> = self
-            .system
-            .processes()
-            .iter()
-            .filter(|(_, process)| {
-                if let Some(filter_config) = filter {
-                    filter_config.should_include_process(
-                        process.name().to_string_lossy().as_ref(),
-                        process.pid().as_u32(),
-                    )
-                } else {
-                    true
+        // Pre-allocate with expected capacity
+        let mut processes: Vec<ProcessMetric> = Vec::with_capacity(max_processes);
+
+        // Collect only processes that pass the filter
+        for (_, process) in self.system.processes().iter() {
+            // Skip if filter rejects this process
+            if let Some(filter_config) = filter {
+                let process_name = process.name().to_string_lossy();
+                if !filter_config.should_include_process(process_name.as_ref(), process.pid().as_u32()) {
+                    continue;
                 }
-            })
-            .map(|(_, process)| ProcessMetric {
-                pid: process.pid().as_u32(),
-                name: process.name().to_string_lossy().to_string(),
-                cpu_usage_percent: process.cpu_usage(),
-                memory_bytes: process.memory(),
-            })
-            .collect();
+            }
 
-        // Sort by CPU usage and limit to top N (configurable)
+            let cpu_usage = process.cpu_usage();
+
+            // If we haven't reached max_processes yet, just add it
+            if processes.len() < max_processes {
+                processes.push(ProcessMetric {
+                    pid: process.pid().as_u32(),
+                    name: process.name().to_string_lossy().to_string(),
+                    cpu_usage_percent: cpu_usage,
+                    memory_bytes: process.memory(),
+                });
+            } else {
+                // Find the process with minimum CPU usage in our list
+                if let Some(min_idx) = processes
+                    .iter()
+                    .enumerate()
+                    .min_by(|(_, a), (_, b)| {
+                        a.cpu_usage_percent
+                            .partial_cmp(&b.cpu_usage_percent)
+                            .unwrap_or(std::cmp::Ordering::Equal)
+                    })
+                    .map(|(idx, _)| idx)
+                {
+                    // Replace if current process has higher CPU usage
+                    if cpu_usage > processes[min_idx].cpu_usage_percent {
+                        processes[min_idx] = ProcessMetric {
+                            pid: process.pid().as_u32(),
+                            name: process.name().to_string_lossy().to_string(),
+                            cpu_usage_percent: cpu_usage,
+                            memory_bytes: process.memory(),
+                        };
+                    }
+                }
+            }
+        }
+
+        // Final sort by CPU usage (descending)
         processes.sort_by(|a, b| {
             b.cpu_usage_percent
                 .partial_cmp(&a.cpu_usage_percent)
                 .unwrap_or(std::cmp::Ordering::Equal)
         });
-
-        let max_processes = filter
-            .map(|f| f.max_processes)
-            .unwrap_or(10);
         processes.truncate(max_processes);
 
         processes
     }
 
@@ -269,4 +373,21 @@
         // For now, return empty vector
         vec![]
     }
+
+    fn collect_load_avg(&self) -> LoadAvgMetric {
+        let load_avg = System::load_average();
+        LoadAvgMetric {
+            load1: load_avg.one,
+            load5: load_avg.five,
+            load15: load_avg.fifteen,
+        }
+    }
+
+    fn collect_disk_io(&mut self) -> Vec<DiskIoMetric> {
+        // Note: sysinfo 0.31 doesn't provide disk I/O stats directly
+        // This would require reading /proc/diskstats on Linux or using platform-specific APIs
+        // For now, return empty vector
+        // TODO: Implement platform-specific disk I/O collection
+        vec![]
+    }
 }
 
@@ -166,6 +166,14 @@ pub struct MetricsConfig {
     #[serde(default = "default_true")]
     pub temperature: bool,
 
+    /// Export load average metrics
+    #[serde(default = "default_true")]
+    pub load_avg: bool,
+
+    /// Export disk I/O metrics
+    #[serde(default = "default_true")]
+    pub disk_io: bool,
+
     /// Process filter configuration
     #[serde(default)]
     pub process_filter: Option<ProcessFilterConfig>,
@@ -180,6 +188,8 @@ impl Default for MetricsConfig {
             disk: true,
             processes: false,
             temperature: true,
+            load_avg: true,
+            disk_io: true,
             process_filter: None,
         }
     }
 
@@ -20,11 +20,22 @@ struct MetricInstruments {
     swap_total: opentelemetry::metrics::Gauge<u64>,
     network_rx: opentelemetry::metrics::Gauge<u64>,
     network_tx: opentelemetry::metrics::Gauge<u64>,
+    network_rx_packets: opentelemetry::metrics::Gauge<u64>,
+    network_tx_packets: opentelemetry::metrics::Gauge<u64>,
+    network_rx_errors: opentelemetry::metrics::Gauge<u64>,
+    network_tx_errors: opentelemetry::metrics::Gauge<u64>,
     disk_usage: opentelemetry::metrics::Gauge<u64>,
     disk_total: opentelemetry::metrics::Gauge<u64>,
+    disk_io_read_bytes: opentelemetry::metrics::Gauge<u64>,
+    disk_io_write_bytes: opentelemetry::metrics::Gauge<u64>,
+    disk_io_read_ops: opentelemetry::metrics::Gauge<u64>,
+    disk_io_write_ops: opentelemetry::metrics::Gauge<u64>,
     process_cpu: opentelemetry::metrics::Gauge<f64>,
     process_memory: opentelemetry::metrics::Gauge<u64>,
     temperature: opentelemetry::metrics::Gauge<f64>,
+    load_avg_1: opentelemetry::metrics::Gauge<f64>,
+    load_avg_5: opentelemetry::metrics::Gauge<f64>,
+    load_avg_15: opentelemetry::metrics::Gauge<f64>,
 }
 
 impl MetricsExporter {
@@ -94,6 +105,22 @@ impl MetricsExporter {
                 .u64_gauge("system_network_tx_bytes_per_sec")
                 .with_description("Bytes transmitted per second")
                 .init(),
+            network_rx_packets: meter
+                .u64_gauge("system_network_rx_packets_per_sec")
+                .with_description("Packets received per second")
+                .init(),
+            network_tx_packets: meter
+                .u64_gauge("system_network_tx_packets_per_sec")
+                .with_description("Packets transmitted per second")
+                .init(),
+            network_rx_errors: meter
+                .u64_gauge("system_network_rx_errors_per_sec")
+                .with_description("Receive errors per second")
+                .init(),
+            network_tx_errors: meter
+                .u64_gauge("system_network_tx_errors_per_sec")
+                .with_description("Transmit errors per second")
+                .init(),
             disk_usage: meter
                 .u64_gauge("system_disk_usage_bytes")
                 .with_description("Disk usage in bytes")
@@ -114,6 +141,34 @@ impl MetricsExporter {
                 .f64_gauge("system_temperature_celsius")
                 .with_description("Temperature in Celsius")
                 .init(),
+            disk_io_read_bytes: meter
+                .u64_gauge("system_disk_io_read_bytes_per_sec")
+                .with_description("Disk read bytes per second")
+                .init(),
+            disk_io_write_bytes: meter
+                .u64_gauge("system_disk_io_write_bytes_per_sec")
+                .with_description("Disk write bytes per second")
+                .init(),
+            disk_io_read_ops: meter
+                .u64_gauge("system_disk_io_read_ops_per_sec")
+                .with_description("Disk read operations per second")
+                .init(),
+            disk_io_write_ops: meter
+                .u64_gauge("system_disk_io_write_ops_per_sec")
+                .with_description("Disk write operations per second")
+                .init(),
+            load_avg_1: meter
+                .f64_gauge("system_load_average_1m")
+                .with_description("System load average over 1 minute")
+                .init(),
+            load_avg_5: meter
+                .f64_gauge("system_load_average_5m")
+                .with_description("System load average over 5 minutes")
+                .init(),
+            load_avg_15: meter
+                .f64_gauge("system_load_average_15m")
+                .with_description("System load average over 15 minutes")
+                .init(),
         };
 
         Ok(Self {
@@ -147,6 +202,10 @@ impl MetricsExporter {
                 let attrs = &[KeyValue::new("interface", net.interface_name.clone())];
                 self.gauges.network_rx.record(net.rx_bytes_per_sec, attrs);
                 self.gauges.network_tx.record(net.tx_bytes_per_sec, attrs);
+                self.gauges.network_rx_packets.record(net.rx_packets_per_sec, attrs);
+                self.gauges.network_tx_packets.record(net.tx_packets_per_sec, attrs);
+                self.gauges.network_rx_errors.record(net.rx_errors_per_sec, attrs);
+                self.gauges.network_tx_errors.record(net.tx_errors_per_sec, attrs);
             }
         }
 
@@ -185,6 +244,24 @@ impl MetricsExporter {
                 );
             }
         }
+
+        // Export load average metrics
+        if let Some(load_avg) = &metrics.load_avg {
+            self.gauges.load_avg_1.record(load_avg.load1, &[]);
+            self.gauges.load_avg_5.record(load_avg.load5, &[]);
+            self.gauges.load_avg_15.record(load_avg.load15, &[]);
+        }
+
+        // Export disk I/O metrics
+        if let Some(disk_io_metrics) = &metrics.disk_io {
+            for disk_io in disk_io_metrics {
+                let attrs = &[KeyValue::new("device", disk_io.device_name.clone())];
+                self.gauges.disk_io_read_bytes.record(disk_io.read_bytes_per_sec, attrs);
+                self.gauges.disk_io_write_bytes.record(disk_io.write_bytes_per_sec, attrs);
+                self.gauges.disk_io_read_ops.record(disk_io.read_ops_per_sec, attrs);
+                self.gauges.disk_io_write_ops.record(disk_io.write_ops_per_sec, attrs);
+            }
+        }
     }
 
     pub async fn shutdown(self) -> Result<()> {
@@ -33,10 +33,12 @@ environment = "production"
 [metrics]
 cpu = true # CPU usage per core
 memory = true # RAM and swap usage
-network = true # Network RX/TX
+network = true # Network RX/TX bytes, packets, errors
 disk = true # Disk usage
-processes = true # Top 10 processes (disabled by default - can generate high cardinality)
+processes = true # Top N processes by CPU (disabled by default - can generate high cardinality)
 temperature = true # System temperatures (if available)
+load_avg = true # System load average (1m, 5m, 15m)
+disk_io = true # Disk I/O read/write bytes and operations (Linux only)
 
 # Process filtering configuration
 # Only used when processes = true
@@ -49,7 +51,7 @@ temperature = true # System temperatures (if available)
 filter_mode = "blacklist"
 
 # Maximum number of processes to report (sorted by CPU usage, default: 10)
-max_processes = 10
+max_processes = 5
 
 # List of process names to filter (case-insensitive substring match)
 names = [