Compare commits: 9bb3f113db...main (5 commits)

| Author | SHA1 | Date |
|--------|------|------|
| | 869427c24d | |
| | 4c2a5b6738 | |
| | 9dba844d13 | |
| | 5cb3395694 | |
| | 2e950506b7 | |

**.gitignore** (vendored, 2 changes)

```diff
@@ -19,3 +19,5 @@ Thumbs.db
 # Logs
 *.log
 
+
+loki-data/
```

**Cargo.lock** (generated, 1142 changes)

File diff suppressed because it is too large.

**Cargo.toml** (16 changes)

```diff
@@ -9,27 +9,27 @@ repository = "https://github.com/battilo/symon"
 
 [dependencies]
 # OpenTelemetry
-opentelemetry = { version = "0.26", features = ["metrics"] }
-opentelemetry-otlp = { version = "0.26", features = ["metrics", "grpc-tonic"] }
-opentelemetry_sdk = { version = "0.26", features = ["metrics", "rt-tokio"] }
-opentelemetry-semantic-conventions = "0.26"
+opentelemetry = { version = "0.31", features = ["metrics"] }
+opentelemetry-otlp = { version = "0.31", features = ["metrics", "grpc-tonic"] }
+opentelemetry_sdk = { version = "0.31", features = ["metrics", "rt-tokio"] }
+opentelemetry-semantic-conventions = "0.31"
 
 # Async runtime
 tokio = { version = "1.48", features = ["rt-multi-thread", "macros", "sync", "time", "signal"] }
-tonic = "0.11"
+tonic = "0.14.2"
 
 # System metrics collection
-sysinfo = "0.31"
+sysinfo = "0.37.2"
 
 # Configuration
 serde = { version = "1.0", features = ["derive"] }
-toml = "0.8"
+toml = "0.9.8"
 
 # Logging and error handling
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 anyhow = "1.0"
-thiserror = "1.0"
+thiserror = "2.0.17"
 
 # Process filtering
 regex = "1.11"
```

**docker-compose-alloy/METRICS.md** (new file, 227 lines)

# Symon OpenTelemetry Metrics Reference

This document lists all metrics exported by Symon when running with the `opentelemetry` feature enabled.

## System Metrics

### CPU

| Metric Name | Type | Labels | Description |
|------------|------|--------|-------------|
| `system_cpu_usage_percent` | Gauge | `cpu_id` | CPU usage percentage per core |

**Example:**

```promql
# Average CPU across all cores
avg(system_cpu_usage_percent)

# CPU usage for core 0
system_cpu_usage_percent{cpu_id="0"}
```

### Memory

| Metric Name | Type | Labels | Description |
|------------|------|--------|-------------|
| `system_memory_usage_bytes` | Gauge | - | RAM memory currently in use |
| `system_memory_total_bytes` | Gauge | - | Total RAM memory available |
| `system_swap_usage_bytes` | Gauge | - | Swap memory currently in use |
| `system_swap_total_bytes` | Gauge | - | Total swap memory available |

**Example:**

```promql
# Memory usage percentage
(system_memory_usage_bytes / system_memory_total_bytes) * 100

# Available memory
system_memory_total_bytes - system_memory_usage_bytes
```

### Network

| Metric Name | Type | Labels | Description |
|------------|------|--------|-------------|
| `system_network_rx_bytes_rate` | Gauge | `interface` | Network receive rate in bytes/sec |
| `system_network_tx_bytes_rate` | Gauge | `interface` | Network transmit rate in bytes/sec |

**Example:**

```promql
# Total network throughput
sum(system_network_rx_bytes_rate) + sum(system_network_tx_bytes_rate)

# RX rate for specific interface
system_network_rx_bytes_rate{interface="eth0"}
```

### Disk

| Metric Name | Type | Labels | Description |
|------------|------|--------|-------------|
| `system_disk_usage_bytes` | Gauge | `device`, `mount` | Disk space currently in use |
| `system_disk_total_bytes` | Gauge | `device`, `mount` | Total disk space available |

**Example:**

```promql
# Disk usage percentage
(system_disk_usage_bytes / system_disk_total_bytes) * 100

# Free disk space
system_disk_total_bytes - system_disk_usage_bytes
```

### Temperature

| Metric Name | Type | Labels | Description |
|------------|------|--------|-------------|
| `system_temperature_celsius` | Gauge | `sensor` | Temperature readings in Celsius |

**Example:**

```promql
# Average temperature across all sensors
avg(system_temperature_celsius)

# Maximum temperature
max(system_temperature_celsius)
```

## Process Metrics

| Metric Name | Type | Labels | Description |
|------------|------|--------|-------------|
| `system_process_cpu_usage_percent` | Gauge | `name`, `pid` | CPU usage percentage per process |
| `system_process_memory_usage_bytes` | Gauge | `name`, `pid` | Memory usage in bytes per process |
| `system_process_count` | Gauge | - | Total number of processes |

**Example:**

```promql
# Top 10 processes by CPU
topk(10, system_process_cpu_usage_percent)

# Top 10 processes by memory
topk(10, system_process_memory_usage_bytes)

# Total memory used by all Chrome processes
sum(system_process_memory_usage_bytes{name=~".*chrome.*"})
```

## Recording Rules

The following recording rules are pre-configured in Prometheus (see `rules/Symon_rules.yml`):

| Rule Name | Expression | Description |
|-----------|------------|-------------|
| `system_process_cpu_usage_percent:recent` | Recent process CPU metrics | Filters out stale process data (>2 min old) |
| `system_process_memory_usage_bytes:recent` | Recent process memory metrics | Filters out stale process data (>2 min old) |

**Example:**

```promql
# Query only recent process data
topk(10, system_process_cpu_usage_percent:recent)
```
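
For orientation, the shape of such a rule is roughly the following. This is a sketch only: the group and record names appear in the rules diff later on this page, but the `expr` is an assumption, not a copy of `rules/Symon_rules.yml`.

```yaml
# Sketch of one recording rule; the expression is assumed, not taken from the repo.
groups:
  - name: symon_process_metrics
    interval: 30s
    rules:
      - record: system_process_cpu_usage_percent:recent
        expr: last_over_time(system_process_cpu_usage_percent[2m])
```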

## Common Queries

### System Health

```promql
# Overall system CPU usage
avg(system_cpu_usage_percent)

# Memory pressure (>80% is high)
(system_memory_usage_bytes / system_memory_total_bytes) * 100

# Disk pressure (>90% is critical)
(system_disk_usage_bytes / system_disk_total_bytes) * 100
```

### Resource Hogs

```promql
# Top CPU consumers
topk(5, system_process_cpu_usage_percent)

# Top memory consumers
topk(5, system_process_memory_usage_bytes)

# Processes using >1GB memory
system_process_memory_usage_bytes > 1073741824
```

### Network Analysis

```promql
# Total network traffic (RX + TX)
sum(system_network_rx_bytes_rate) + sum(system_network_tx_bytes_rate)

# Network traffic by interface
sum by (interface) (system_network_rx_bytes_rate + system_network_tx_bytes_rate)

# Interfaces with high RX rate (>10MB/s)
system_network_rx_bytes_rate > 10485760
```

## Alerting Examples

### Sample Prometheus Alert Rules

```yaml
groups:
  - name: Symon_alerts
    interval: 30s
    rules:
      - alert: HighCPUUsage
        expr: avg(system_cpu_usage_percent) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "Average CPU usage is {{ $value }}%"

      - alert: HighMemoryUsage
        expr: (system_memory_usage_bytes / system_memory_total_bytes) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is {{ $value }}%"

      - alert: DiskAlmostFull
        expr: (system_disk_usage_bytes / system_disk_total_bytes) * 100 > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Disk {{ $labels.mount }} almost full"
          description: "Disk usage is {{ $value }}% on {{ $labels.mount }}"
```

## Label Reference

| Label | Used In | Description |
|-------|---------|-------------|
| `cpu_id` | CPU metrics | CPU core identifier (0, 1, 2, ...) |
| `interface` | Network metrics | Network interface name (eth0, wlan0, ...) |
| `device` | Disk metrics | Device name (/dev/sda1, ...) |
| `mount` | Disk metrics | Mount point (/, /home, ...) |
| `sensor` | Temperature | Temperature sensor name |
| `name` | Process metrics | Process name |
| `pid` | Process metrics | Process ID |
| `exported_job` | All | Always "Symon-system-monitor" |
| `otel_scope_name` | All | Always "Symon-system-monitor" |

## Data Retention

By default, Prometheus stores metrics for 15 days. Retention is controlled by a command-line flag rather than a setting in `prometheus.yml`; adjust it on the Prometheus service, for example:

```yaml
# In docker-compose.yml, under the prometheus service
command:
  - '--config.file=/etc/prometheus/prometheus.yml'
  - '--storage.tsdb.retention.time=30d'  # Keep data for 30 days
```

For long-term storage, consider using:

- **TimescaleDB** (see `docker-compose-timescale.yml.ko`)
- **Thanos** for multi-cluster metrics
- **Cortex** for horizontally scalable storage

**docker-compose-alloy/README-ALLOY.md** (new file, 148 lines)

# Observability Stack with Grafana Alloy

This directory contains a **simplified** observability stack that uses **Grafana Alloy** in place of three separate components.

## Differences from the original docker-compose

### Original setup (7 services)

```
- Promtail        → Collects logs from the containers
- Node Exporter   → Host system metrics
- OTEL Collector  → Receives OTLP telemetry
- Prometheus      → Metrics storage
- Loki            → Log storage
- Tempo           → Trace storage
- Grafana         → Visualization
```

### Setup with Alloy (5 services) ⭐

```
- Alloy       → Log collection + host metrics + OTLP (all in one!)
- Prometheus  → Metrics storage
- Loki        → Log storage
- Tempo       → Trace storage
- Grafana     → Visualization
+ Pyroscope   → Continuous profiling
```

## What Alloy does

Grafana Alloy replaces **3 services** with a single one:

| Function | Before | After |
|----------|-------|------|
| **Docker log collection** | Promtail | Alloy |
| **Host system metrics** | Node Exporter | Alloy (built-in unix exporter) |
| **OTLP ingestion** | OTEL Collector | Alloy (otelcol receiver) |

### Advantages of Alloy:

- ✅ **Fewer containers** to manage (5 instead of 7)
- ✅ **Unified configuration** in a single file (alloy-config.alloy)
- ✅ **Lower resource usage** (CPU/RAM)
- ✅ **Built-in UI** for debugging (http://localhost:12345)
- ✅ **More modern** (River language instead of YAML)
- ✅ **Hot reload** of the configuration

### How it works:

1. **Logs**: Alloy discovers containers carrying the `logging=alloy` label and ships their logs to Loki
2. **Metrics**: Alloy collects host metrics and sends them to Prometheus via remote write
3. **Traces**: Alloy receives OTLP traces and forwards them to Tempo

A minimal sketch of the log pipeline is shown below; the full configuration lives in `alloy-config.alloy`.
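
The sketch keeps only the discovery and write stages and omits the label filter and relabel rules that the real `alloy-config.alloy` (included later in this diff) adds on top:

```river
// Minimal log-pipeline sketch; see alloy-config.alloy for the complete version.
discovery.docker "containers" {
  host = "unix:///var/run/docker.sock"
}

loki.source.docker "containers" {
  host       = "unix:///var/run/docker.sock"
  targets    = discovery.docker.containers.targets
  forward_to = [loki.write.default.receiver]
}

loki.write "default" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}
```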

## Exposed ports

| Service | Port | Description |
|---------|------|-------------|
| Alloy | 12345 | UI and self-monitoring metrics |
| Alloy | 4317 | OTLP gRPC (traces/metrics) |
| Alloy | 4318 | OTLP HTTP (traces/metrics) |
| Prometheus | 9090 | Query and UI |
| Loki | 3100 | Push/Query API |
| Tempo | 3200 | Query API |
| Pyroscope | 4040 | Profiling UI and API |
| Grafana | 3000 | Dashboards |

## How to use it

### Start the stack:

```bash
cd docker-compose-alloy
docker-compose up -d
```

### Check Alloy:

```bash
# Alloy UI (very handy for debugging!)
open http://localhost:12345

# See the discovered targets
curl http://localhost:12345/api/v0/component/discovery.docker.containers/targets

# See the loaded configuration
curl http://localhost:12345/api/v0/web/components
```

### Access the services:

- **Grafana**: http://localhost:3000 (admin/admin)
- **Alloy UI**: http://localhost:12345
- **Prometheus**: http://localhost:9090
- **Pyroscope**: http://localhost:4040

## Configuration

### Main files:

- **alloy-config.alloy**: Unified Alloy configuration (replaces promtail-config.yml and otel-collector-config.yml)
- **prometheus.yml**: Prometheus configuration (simplified, since Alloy does remote write)
- **loki-config.yml**: Loki configuration (unchanged)
- **tempo-config.yml**: Tempo configuration (unchanged)

### Editing the Alloy configuration:

1. Edit `alloy-config.alloy`
2. Restart the container: `docker-compose restart alloy`
3. Verify the configuration: http://localhost:12345

## Available metrics

Alloy exposes the same metrics as Node Exporter, with the `node_*` prefix:

```promql
# CPU usage
rate(node_cpu_seconds_total[5m])

# Memory
node_memory_MemAvailable_bytes

# Disk
node_filesystem_avail_bytes
```

## Troubleshooting

### Check that Alloy is collecting logs:

```bash
docker-compose logs alloy | grep loki
```

### Check that Alloy is sending metrics to Prometheus:

```bash
curl http://localhost:9090/api/v1/label/__name__/values | grep node_
```

### List the active components in Alloy:

```bash
curl http://localhost:12345/api/v0/web/components | jq
```

## Migrating from the original version

If you already run the version with Promtail/Node-Exporter/OTEL:

1. Stop the old stack: `cd ../docker-compose && docker-compose down`
2. Start the new one: `cd ../docker-compose-alloy && docker-compose up -d`
3. Historical data in Prometheus/Loki/Tempo is preserved in the Docker volumes

## Useful links

- [Grafana Alloy Documentation](https://grafana.com/docs/alloy/latest/)
- [Alloy Configuration Reference](https://grafana.com/docs/alloy/latest/reference/)
- [River Language](https://grafana.com/docs/alloy/latest/concepts/configuration-syntax/)

**docker-compose-alloy/README.md** (new file, 195 lines)

# Symon OpenTelemetry Docker Compose Setup

This directory contains a Docker Compose setup for running an observability stack to monitor Symon with OpenTelemetry.

## Architecture

The stack includes:

1. **OpenTelemetry Collector** - Receives metrics from Symon via the OTLP protocol
2. **Prometheus** - Scrapes and stores metrics from the OTEL Collector
3. **Grafana** - Visualizes metrics from Prometheus

```
Symon (with --headless flag)
  ↓ (OTLP/gRPC on port 4317)
OpenTelemetry Collector
  ↓ (Prometheus scrape on port 8889)
Prometheus
  ↓ (Query on port 9090)
Grafana (accessible on port 3000)
```

## Quick Start

### 1. Start the observability stack

```bash
cd docker-compose
docker-compose up -d
```

This will start:

- OpenTelemetry Collector on ports 4317 (gRPC), 4318 (HTTP), 8889 (metrics)
- Prometheus on port 9090
- Grafana on port 3000

### 2. Build Symon with OpenTelemetry support

```bash
cd ..
cargo build --release --features opentelemetry
```

### 3. Create a configuration file

Create a `Symon-config.toml` file:

```toml
[opentelemetry]
enabled = true
endpoint = "http://localhost:4317"
service_name = "Symon-system-monitor"
export_interval_ms = 5000

[opentelemetry.metrics]
cpu = true
memory = true
network = true
disk = true
processes = true
temperature = true
gpu = true
```

### 4. Run Symon in headless mode

```bash
./target/release/btm --config Symon-config.toml --headless
```

Or without a config file:

```bash
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 \
./target/release/btm --headless
```

### 5. Access the dashboards

- **Prometheus**: http://localhost:9090
- **Grafana**: http://localhost:3000 (username: `admin`, password: `admin`)

## Configuration Files

### otel-collector-config.yml

Configures the OpenTelemetry Collector to:

- Receive OTLP data on ports 4317 (gRPC) and 4318 (HTTP)
- Export metrics in Prometheus format on port 8889
- Debug-log all received data

### prometheus.yml

Configures Prometheus to:

- Scrape metrics from the OTEL Collector every 10 seconds
- Load alerting rules from `rules/Symon_rules.yml`

### rules/Symon_rules.yml

Contains Prometheus recording rules for Symon metrics, including:

- Recent process CPU usage metrics
- Recent process memory usage metrics

## Viewing Metrics in Prometheus

1. Go to http://localhost:9090
2. Click on "Graph"
3. Try these example queries:

```promql
# CPU usage by core
system_cpu_usage_percent

# Memory usage
system_memory_usage_bytes

# Network RX/TX
system_network_rx_bytes
system_network_tx_bytes

# Disk usage
system_disk_usage_bytes

# Top processes by CPU
topk(10, system_process_cpu_usage_percent)

# Top processes by memory
topk(10, system_process_memory_usage_bytes)
```

## Grafana Configuration

Grafana is automatically configured with:

- **Prometheus data source** (http://prometheus:9090) - pre-configured
- **Symon System Overview dashboard** - pre-loaded

To access:

1. Go to http://localhost:3000 (username: `admin`, password: `admin`)
2. Navigate to Dashboards → Browse → "Symon System Overview"

The dashboard includes:

- CPU usage by core
- Memory usage (RAM/Swap)
- Network traffic
- Disk usage
- Top 10 processes by CPU
- Top 10 processes by Memory

## Stopping the Stack

```bash
docker-compose down
```

To also remove volumes:

```bash
docker-compose down -v
```

## Troubleshooting

### Symon not sending metrics

Check the OTEL Collector logs:

```bash
docker-compose logs -f otel-collector
```

You should see messages about receiving metrics.

### Prometheus not scraping

1. Check the Prometheus targets at http://localhost:9090/targets
2. The `otel-collector` target should be UP

### No data in Grafana

1. Verify that the Prometheus data source is configured correctly
2. Check that Prometheus has data by querying it directly
3. Ensure your time range in Grafana includes when Symon was running

## Advanced Configuration

### Using with TimescaleDB (optional)

A TimescaleDB configuration file is available as `docker-compose-timescale.yml.ko` for long-term storage of metrics. Rename it to include it in your stack.

### Custom Prometheus Rules

Edit `rules/Symon_rules.yml` to add custom recording or alerting rules.

### OTEL Collector Sampling

Edit `otel-collector-config.yml` to adjust the batch processor settings for different performance characteristics (a sketch of these settings follows below).
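
As a rough sketch only (not copied from this repo's `otel-collector-config.yml`, whose exact values may differ), the batch processor block has this shape:

```yaml
# Assumed values for illustration; tune timeout and batch sizes for your load.
processors:
  batch:
    timeout: 10s
    send_batch_size: 10000
    send_batch_max_size: 11000
```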

**docker-compose-alloy/alloy-config.alloy** (new file, 169 lines)

```river
// Grafana Alloy Configuration
// Replaces: Promtail + Node Exporter + OTEL Collector

// ============================================================================
// LOGGING - Collect logs from Docker containers (replaces Promtail)
// ============================================================================

// Discover Docker containers with the label logging=promtail
discovery.docker "containers" {
  host = "unix:///var/run/docker.sock"

  filter {
    name   = "label"
    values = ["logging=promtail"]
  }

  refresh_interval = "5s"
}

// Scrape logs from the discovered containers
loki.source.docker "containers" {
  host       = "unix:///var/run/docker.sock"
  targets    = discovery.docker.containers.targets
  forward_to = [loki.relabel.docker.receiver]
}

// Relabeling to add labels to the logs
loki.relabel "docker" {
  forward_to = [loki.write.default.receiver]

  rule {
    source_labels = ["__meta_docker_container_name"]
    regex         = "/(.*)"
    target_label  = "container"
  }

  rule {
    source_labels = ["__meta_docker_container_label_logging_jobname"]
    target_label  = "job"
  }
}

// Write the logs to Loki
loki.write "default" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }

  external_labels = {
    environment = "production",
    cluster     = "myapp-cluster",
  }
}

// ============================================================================
// METRICS - Host system metrics (replaces Node Exporter)
// ============================================================================

// Collect local host metrics
prometheus.exporter.unix "host" {
  // Collects operating-system metrics
  set_collectors = [
    "cpu",
    "loadavg",
    "meminfo",
    "netdev",
    "diskstats",
    "filesystem",
    "uname",
    "time",
  ]
}

// Scrape the collected metrics
prometheus.scrape "host_metrics" {
  targets    = prometheus.exporter.unix.host.targets
  forward_to = [prometheus.remote_write.default.receiver]

  scrape_interval = "10s"

  clustering {
    enabled = false
  }
}

// Scrape Alloy's own self-monitoring metrics
prometheus.scrape "alloy_metrics" {
  targets = [{
    __address__ = "localhost:12345",
  }]
  forward_to = [prometheus.remote_write.default.receiver]

  scrape_interval = "10s"
}

// ============================================================================
// TRACES - Receive OTLP traces (replaces the OTEL Collector)
// ============================================================================

// Receive traces via OTLP gRPC
otelcol.receiver.otlp "default" {
  grpc {
    endpoint = "0.0.0.0:4317"
  }

  http {
    endpoint = "0.0.0.0:4318"
  }

  output {
    metrics = [otelcol.processor.batch.default.input]
    logs    = [otelcol.processor.batch.default.input]
    traces  = [otelcol.processor.batch.default.input]
  }
}

// Batch processor to optimize sending
otelcol.processor.batch "default" {
  timeout             = "10s"
  send_batch_size     = 10000
  send_batch_max_size = 11000

  output {
    metrics = [otelcol.exporter.prometheus.default.input]
    traces  = [otelcol.exporter.otlp.tempo.input]
    logs    = [otelcol.exporter.loki.default.input]
  }
}

// Export OTLP metrics to Prometheus
otelcol.exporter.prometheus "default" {
  forward_to = [prometheus.remote_write.default.receiver]
}

// Export traces to Tempo
otelcol.exporter.otlp "tempo" {
  client {
    endpoint = "tempo:4317"
    tls {
      insecure = true
    }
  }
}

// Export OTLP logs to Loki
otelcol.exporter.loki "default" {
  forward_to = [loki.write.default.receiver]
}

// ============================================================================
// REMOTE WRITE - Send metrics to Prometheus
// ============================================================================

prometheus.remote_write "default" {
  endpoint {
    url = "http://prometheus:9090/api/v1/write"

    metadata_config {
      send_interval = "1m"
    }

    queue_config {
      capacity             = 10000
      max_shards           = 10
      min_shards           = 1
      max_samples_per_send = 5000
    }
  }
}
```

**docker-compose-alloy/docker-compose.yml** (new file, 120 lines)

```yaml
services:

  alloy:
    image: grafana/alloy:latest
    container_name: alloy
    command:
      - run
      - /etc/alloy/config.alloy
      - --server.http.listen-addr=0.0.0.0:12345
      - --storage.path=/var/lib/alloy/data
    volumes:
      - ./alloy-config.alloy:/etc/alloy/config.alloy:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    ports:
      - "4317:4317"   # OTLP gRPC
      - "4318:4318"   # OTLP HTTP
      - "12345:12345" # Alloy UI and metrics
    networks:
      - observ-net
    restart: unless-stopped
    privileged: true
    labels:
      logging: "alloy"

  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./rules:/etc/prometheus/rules
    ports:
      - "9090:9090" # Prometheus web UI
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--web.enable-remote-write-receiver'
      - '--enable-feature=exemplar-storage'
    depends_on:
      - alloy
    networks:
      - observ-net
    labels:
      logging: "alloy"

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_SECURITY_ADMIN_USER=admin
    volumes:
      - grafana-storage:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    depends_on:
      - prometheus
    networks:
      - observ-net
    labels:
      logging: "alloy"

  loki:
    image: grafana/loki:latest
    container_name: loki
    user: "0"
    ports:
      - "3100:3100"
    volumes:
      - ./loki-config.yml:/etc/loki/local-config.yaml
      - ./loki-data:/loki
    command: -config.file=/etc/loki/local-config.yaml
    restart: unless-stopped
    networks:
      - observ-net
    labels:
      logging: "alloy"

  tempo:
    image: grafana/tempo:latest
    container_name: tempo
    user: "0"
    command: ["-config.file=/etc/tempo.yml"]
    volumes:
      - ./tempo-config.yml:/etc/tempo.yml
      - tempo-data:/tmp/tempo
    ports:
      - "3200:3200" # Tempo HTTP
    restart: unless-stopped
    networks:
      - observ-net
    labels:
      logging: "alloy"

  pyroscope:
    image: grafana/pyroscope:latest
    container_name: pyroscope
    ports:
      - "4040:4040" # Pyroscope UI and API
    volumes:
      - pyroscope-data:/var/lib/pyroscope
    restart: unless-stopped
    networks:
      - observ-net
    labels:
      logging: "alloy"
    environment:
      - PYROSCOPE_LOG_LEVEL=info

volumes:
  grafana-storage:
  tempo-data:
  pyroscope-data:

networks:
  observ-net:
    driver: bridge
```

**Grafana datasource provisioning** (new file, 74 lines; file path not shown in this capture)

```yaml
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true
    jsonData:
      timeInterval: 10s
      queryTimeout: 60s

  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    isDefault: false
    editable: true
    jsonData:
      maxLines: 1000
      derivedFields:
        - datasourceUid: tempo
          matcherRegex: "trace_?id[\":]\\s*\"?([0-9a-fA-F]+)"
          name: TraceID
          url: "$${__value.raw}"

  - name: Tempo
    type: tempo
    uid: tempo
    access: proxy
    url: http://tempo:3200
    isDefault: false
    editable: true
    jsonData:
      httpMethod: GET
      tracesToLogs:
        datasourceUid: loki
        mapTagNamesEnabled: true
        mappedTags:
          - key: service.name
            value: service_name
        spanStartTimeShift: '-1h'
        spanEndTimeShift: '1h'
        filterByTraceID: true
        filterBySpanID: false
      tracesToMetrics:
        datasourceUid: prometheus
        spanStartTimeShift: '-1h'
        spanEndTimeShift: '1h'
      tracesToProfiles:
        datasourceUid: pyroscope
        tags:
          - key: service.name
            value: service_name
      serviceMap:
        datasourceUid: prometheus
      nodeGraph:
        enabled: true
      search:
        hide: false
      lokiSearch:
        datasourceUid: loki

  - name: Pyroscope
    type: grafana-pyroscope-datasource
    uid: pyroscope
    access: proxy
    url: http://pyroscope:4040
    isDefault: false
    editable: true
    jsonData:
      keepCookies: []
      minStep: '15s'
```

**docker-compose-alloy/loki-config.yml** (new file, 43 lines)

```yaml
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

schema_config:
  configs:
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

ruler:
  alertmanager_url: http://localhost:9093

limits_config:
  retention_period: 720h
  ingestion_rate_mb: 10
  ingestion_burst_size_mb: 20
  volume_enabled: true
```

**docker-compose-alloy/prometheus.yml** (new file, 22 lines)

```yaml
global:
  scrape_interval: 10s
  evaluation_interval: 10s

rule_files:
  - /etc/prometheus/rules/*.yml

scrape_configs:
  # Job 1: Monitor whether Prometheus itself is up
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Job 2: Scrape Alloy metrics (self-monitoring)
  - job_name: 'alloy'
    static_configs:
      - targets: ['alloy:12345']
        labels:
          instance: 'alloy'

# NOTE: Host metrics (formerly Node Exporter) and OTLP data arrive via remote write from Alloy
```

**Prometheus rules file** (modified; file path not shown in this capture)

```diff
@@ -1,5 +1,5 @@
 groups:
-  - name: bottom_process_metrics
+  - name: symon_process_metrics
     interval: 30s
     rules:
       - record: system_process_cpu_usage_percent:recent
```

**docker-compose-alloy/tempo-config.yml** (new file, 49 lines)

```yaml
server:
  http_listen_port: 3200

distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: 0.0.0.0:4317
        http:
          endpoint: 0.0.0.0:4318

ingester:
  max_block_duration: 5m

compactor:
  compaction:
    block_retention: 48h

storage:
  trace:
    backend: local
    wal:
      path: /tmp/tempo/wal
    local:
      path: /tmp/tempo/blocks

query_frontend:
  search:
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09
  trace_by_id:
    duration_slo: 5s

metrics_generator:
  registry:
    external_labels:
      source: tempo
      cluster: docker-compose
  storage:
    path: /tmp/tempo/generator/wal
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true

overrides:
  defaults:
    metrics_generator:
      processors: [service-graphs, span-metrics]
```

**docker-compose-alloy/test-stack.sh** (new executable file, 80 lines)

```bash
#!/bin/bash
# Test script to verify the observability stack is running correctly

set -e

echo "🔍 Testing Symon OpenTelemetry Stack..."
echo ""

# Colors
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Test OTEL Collector gRPC endpoint
echo -n "Testing OTEL Collector gRPC (port 4317)... "
if nc -zv localhost 4317 2>&1 | grep -q "succeeded\|open"; then
    echo -e "${GREEN}✓ OK${NC}"
else
    echo -e "${RED}✗ FAILED${NC}"
    exit 1
fi

# Test OTEL Collector HTTP endpoint
echo -n "Testing OTEL Collector HTTP (port 4318)... "
if nc -zv localhost 4318 2>&1 | grep -q "succeeded\|open"; then
    echo -e "${GREEN}✓ OK${NC}"
else
    echo -e "${RED}✗ FAILED${NC}"
    exit 1
fi

# Test OTEL Collector metrics endpoint
echo -n "Testing OTEL Collector metrics (port 8889)... "
if curl -s http://localhost:8889/metrics > /dev/null; then
    echo -e "${GREEN}✓ OK${NC}"
else
    echo -e "${RED}✗ FAILED${NC}"
    exit 1
fi

# Test Prometheus
echo -n "Testing Prometheus (port 9090)... "
if curl -s http://localhost:9090/-/healthy | grep -q "Prometheus"; then
    echo -e "${GREEN}✓ OK${NC}"
else
    echo -e "${RED}✗ FAILED${NC}"
    exit 1
fi

# Test Prometheus targets
echo -n "Testing Prometheus targets... "
TARGETS=$(curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"up"' | wc -l)
if [ "$TARGETS" -gt 0 ]; then
    echo -e "${GREEN}✓ OK${NC} (${TARGETS} targets up)"
else
    echo -e "${YELLOW}⚠ WARNING${NC} (no targets up yet - this is normal if just started)"
fi

# Test Grafana
echo -n "Testing Grafana (port 3000)... "
if curl -s http://localhost:3000/api/health | grep -q "ok"; then
    echo -e "${GREEN}✓ OK${NC}"
else
    echo -e "${RED}✗ FAILED${NC}"
    exit 1
fi

echo ""
echo -e "${GREEN}✓ All tests passed!${NC}"
echo ""
echo "📊 Access points:"
echo "  - Prometheus: http://localhost:9090"
echo "  - Grafana: http://localhost:3000 (admin/admin)"
echo "  - OTEL Collector metrics: http://localhost:8889/metrics"
echo ""
echo "💡 Next steps:"
echo "  1. Build Symon with: cargo build --release --features opentelemetry"
echo "  2. Run in headless mode: ./target/release/btm --headless"
echo "  3. Check metrics in Prometheus: http://localhost:9090/graph"
```

**Modified file** (path not shown in this capture)

````diff
@@ -1,6 +1,6 @@
-# Bottom OpenTelemetry Metrics Reference
+# Symon OpenTelemetry Metrics Reference
 
-This document lists all metrics exported by Bottom when running with the `opentelemetry` feature enabled.
+This document lists all metrics exported by Symon when running with the `opentelemetry` feature enabled.
 
 ## System Metrics
 
@@ -106,7 +106,7 @@ sum(system_process_memory_usage_bytes{name=~".*chrome.*"})
 
 ## Recording Rules
 
-The following recording rules are pre-configured in Prometheus (see `rules/bottom_rules.yml`):
+The following recording rules are pre-configured in Prometheus (see `rules/Symon_rules.yml`):
 
 | Rule Name | Expression | Description |
 |-----------|------------|-------------|
@@ -166,7 +166,7 @@ system_network_rx_bytes_rate > 10485760
 
 ```yaml
 groups:
-  - name: bottom_alerts
+  - name: Symon_alerts
     interval: 30s
     rules:
       - alert: HighCPUUsage
@@ -208,8 +208,8 @@ groups:
 | `sensor` | Temperature | Temperature sensor name |
 | `name` | Process metrics | Process name |
 | `pid` | Process metrics | Process ID |
-| `exported_job` | All | Always "bottom-system-monitor" |
-| `otel_scope_name` | All | Always "bottom-system-monitor" |
+| `exported_job` | All | Always "Symon-system-monitor" |
+| `otel_scope_name` | All | Always "Symon-system-monitor" |
 
 ## Data Retention
 
````

**Modified file** (path not shown in this capture)

````diff
@@ -1,17 +1,17 @@
-# Bottom OpenTelemetry Docker Compose Setup
+# Symon OpenTelemetry Docker Compose Setup
 
-This directory contains a Docker Compose setup for running an observability stack to monitor Bottom with OpenTelemetry.
+This directory contains a Docker Compose setup for running an observability stack to monitor Symon with OpenTelemetry.
 
 ## Architecture
 
 The stack includes:
 
-1. **OpenTelemetry Collector** - Receives metrics from Bottom via OTLP protocol
+1. **OpenTelemetry Collector** - Receives metrics from Symon via OTLP protocol
 2. **Prometheus** - Scrapes and stores metrics from the OTEL Collector
 3. **Grafana** - Visualizes metrics from Prometheus
 
 ```
-Bottom (with --headless flag)
+Symon (with --headless flag)
   ↓ (OTLP/gRPC on port 4317)
 OpenTelemetry Collector
   ↓ (Prometheus scrape on port 8889)
@@ -34,7 +34,7 @@ This will start:
 - Prometheus on port 9090
 - Grafana on port 3000
 
-### 2. Build Bottom with OpenTelemetry support
+### 2. Build Symon with OpenTelemetry support
 
 ```bash
 cd ..
@@ -43,13 +43,13 @@ cargo build --release --features opentelemetry
 
 ### 3. Create a configuration file
 
-Create a `bottom-config.toml` file:
+Create a `Symon-config.toml` file:
 
 ```toml
 [opentelemetry]
 enabled = true
 endpoint = "http://localhost:4317"
-service_name = "bottom-system-monitor"
+service_name = "Symon-system-monitor"
 export_interval_ms = 5000
 
 [opentelemetry.metrics]
@@ -62,10 +62,10 @@ temperature = true
 gpu = true
 ```
 
-### 4. Run Bottom in headless mode
+### 4. Run Symon in headless mode
 
 ```bash
-./target/release/btm --config bottom-config.toml --headless
+./target/release/btm --config Symon-config.toml --headless
 ```
 
 Or without config file:
@@ -93,11 +93,11 @@ Configures the OpenTelemetry Collector to:
 
 Configures Prometheus to:
 - Scrape metrics from the OTEL Collector every 10 seconds
-- Load alerting rules from `rules/bottom_rules.yml`
+- Load alerting rules from `rules/Symon_rules.yml`
 
-### rules/bottom_rules.yml
+### rules/Symon_rules.yml
 
-Contains Prometheus recording rules for Bottom metrics, including:
+Contains Prometheus recording rules for Symon metrics, including:
 - Recent process CPU usage metrics
 - Recent process memory usage metrics
 
@@ -132,11 +132,11 @@ topk(10, system_process_memory_usage_bytes)
 
 Grafana is automatically configured with:
 - **Prometheus data source** (http://prometheus:9090) - pre-configured
-- **Bottom System Overview dashboard** - pre-loaded
+- **Symon System Overview dashboard** - pre-loaded
 
 To access:
 1. Go to http://localhost:3000 (username: `admin`, password: `admin`)
-2. Navigate to Dashboards → Browse → "Bottom System Overview"
+2. Navigate to Dashboards → Browse → "Symon System Overview"
 
 The dashboard includes:
 - CPU usage by core
@@ -160,7 +160,7 @@ docker-compose down -v
 
 ## Troubleshooting
 
-### Bottom not sending metrics
+### Symon not sending metrics
 
 Check the OTEL Collector logs:
 ```bash
@@ -178,7 +178,7 @@ You should see messages about receiving metrics.
 
 1. Verify Prometheus data source is configured correctly
 2. Check that Prometheus has data by querying directly
-3. Ensure your time range in Grafana includes when Bottom was running
+3. Ensure your time range in Grafana includes when Symon was running
 
 ## Advanced Configuration
 
@@ -188,7 +188,7 @@ A TimescaleDB configuration file is available as `docker-compose-timescale.yml.k
 
 ### Custom Prometheus Rules
 
-Edit `rules/bottom_rules.yml` to add custom recording or alerting rules.
+Edit `rules/Symon_rules.yml` to add custom recording or alerting rules.
 
 ### OTEL Collector Sampling
 
````

**Deleted file** (path not shown in this capture; 61 lines removed)

```yaml
services:
  timescaledb:
    image: timescale/timescaledb-ha:pg15
    environment:
      POSTGRES_PASSWORD: password
      POSTGRES_DB: promscale
      POSTGRES_USER: postgres
    ports:
      - "5432:5432"
    volumes:
      - timescale_data:/var/lib/postgresql/data

  promscale:
    image: timescale/promscale:latest
    ports:
      - "9201:9201"
    depends_on:
      - timescaledb
    environment:
      PROMSCALE_DB_URI: postgres://postgres:password@timescaledb:5432/promscale?sslmode=disable
      PROMSCALE_STARTUP_INSTALL_EXTENSIONS: "true"
    restart: on-failure

  otel-collector:
    image: otel/opentelemetry-collector-contrib:latest
    container_name: otel-collector
    command: ["--config=/etc/otel-collector-config.yml"]
    volumes:
      - ./otel-collector-config.yml:/etc/otel-collector-config.yml
    ports:
      - "4317:4317"

  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./rules:/etc/prometheus/rules
    ports:
      - "9090:9090" # Prometheus web UI
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
    depends_on:
      - otel-collector

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_SECURITY_ADMIN_USER=admin
    volumes:
      - grafana-storage:/var/lib/grafana
    depends_on:
      - prometheus

volumes:
  grafana-storage:
  timescale_data:
```

**Modified file** (path not shown in this capture)

```diff
@@ -12,6 +12,8 @@ services:
       - "8889:8889" # Prometheus metrics endpoint
     networks:
       - observ-net
+    labels:
+      logging: "promtail"
 
 
   prometheus:
@@ -28,9 +30,12 @@ services:
       - otel-collector
     networks:
       - observ-net
+    labels:
+      logging: "promtail"
 
   grafana:
     image: grafana/grafana:latest
+    container_name: grafana
     ports:
       - "3000:3000"
     environment:
@@ -43,9 +48,97 @@ services:
       - prometheus
     networks:
       - observ-net
+    labels:
+      logging: "promtail"
+
+  promtail:
+    image: grafana/promtail:2.9.3
+    container_name: promtail
+    restart: unless-stopped
+    environment:
+      TZ: Europe/Rome
+    volumes:
+      - ./promtail-config.yml:/etc/promtail/config.yml:ro
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+    networks:
+      - observ-net
+    depends_on:
+      - loki
+
+  loki:
+    image: grafana/loki:latest
+    container_name: loki
+    user: "0"
+    ports:
+      - "3100:3100"
+    volumes:
+      - ./loki-config.yml:/etc/loki/local-config.yaml
+      - ./loki-data:/loki
+    command: -config.file=/etc/loki/local-config.yaml
+    restart: unless-stopped
+    networks:
+      - observ-net
+    labels:
+      logging: "promtail"
+
+  tempo:
+    image: grafana/tempo:latest
+    container_name: tempo
+    user: "0"
+    command: ["-config.file=/etc/tempo.yml"]
+    volumes:
+      - ./tempo-config.yml:/etc/tempo.yml
+      - tempo-data:/tmp/tempo
+    ports:
+      - "3200:3200" # Tempo HTTP
+      - "4317"      # OTLP gRPC
+      - "4318"      # OTLP HTTP
+    restart: unless-stopped
+    networks:
+      - observ-net
+    labels:
+      logging: "promtail"
+
+  node-exporter:
+    image: prom/node-exporter:latest
+    container_name: node-exporter
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - '--path.rootfs=/rootfs'
+      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    ports:
+      - "9100:9100"
+    restart: unless-stopped
+    networks:
+      - observ-net
+    labels:
+      logging: "promtail"
+
+  pyroscope:
+    image: grafana/pyroscope:latest
+    container_name: pyroscope
+    ports:
+      - "4040:4040" # Pyroscope UI and API
+    volumes:
+      - pyroscope-data:/var/lib/pyroscope
+    restart: unless-stopped
+    networks:
+      - observ-net
+    labels:
+      logging: "promtail"
+    environment:
+      - PYROSCOPE_LOG_LEVEL=info
+
 
 volumes:
   grafana-storage:
+  tempo-data:
+  pyroscope-data:
 
 networks:
   observ-net:
```

**Modified file** (path not shown in this capture)

```diff
@@ -1,6 +1,6 @@
 {
-  "title": "Bottom System Overview",
+  "title": "Symon System Overview",
-  "uid": "bottom-overview",
+  "uid": "syon-overview",
   "timezone": "browser",
   "schemaVersion": 16,
   "refresh": "5s",
@@ -66,12 +66,12 @@
   "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
   "targets": [
     {
-      "expr": "system_network_rx_bytes_rate",
+      "expr": "system_network_rx_bytes_per_sec",
       "legendFormat": "RX - {{interface}}",
       "refId": "RX"
     },
     {
-      "expr": "system_network_tx_bytes_rate",
+      "expr": "system_network_tx_bytes_per_sec",
       "legendFormat": "TX - {{interface}}",
       "refId": "TX"
     }
```

**Modified file** (path not shown in this capture)

```diff
@@ -1,7 +1,7 @@
 apiVersion: 1
 
 providers:
-  - name: 'Bottom Dashboards'
+  - name: 'Syon Dashboards'
     orgId: 1
     folder: ''
    type: file
```

**Modified file** (path not shown in this capture)

```diff
@@ -10,3 +10,65 @@ datasources:
     jsonData:
       timeInterval: 10s
       queryTimeout: 60s
+
+  - name: Loki
+    type: loki
+    access: proxy
+    url: http://loki:3100
+    isDefault: false
+    editable: true
+    jsonData:
+      maxLines: 1000
+      derivedFields:
+        - datasourceUid: tempo
+          matcherRegex: "trace_?id[\":]\\s*\"?([0-9a-fA-F]+)"
+          name: TraceID
+          url: "$${__value.raw}"
+
+  - name: Tempo
+    type: tempo
+    uid: tempo
+    access: proxy
+    url: http://tempo:3200
+    isDefault: false
+    editable: true
+    jsonData:
+      httpMethod: GET
+      tracesToLogs:
+        datasourceUid: loki
+        mapTagNamesEnabled: true
+        mappedTags:
+          - key: service.name
+            value: service_name
+        spanStartTimeShift: '-1h'
+        spanEndTimeShift: '1h'
+        filterByTraceID: true
+        filterBySpanID: false
+      tracesToMetrics:
+        datasourceUid: prometheus
+        spanStartTimeShift: '-1h'
+        spanEndTimeShift: '1h'
+      tracesToProfiles:
+        datasourceUid: pyroscope
+        tags:
+          - key: service.name
+            value: service_name
+      serviceMap:
+        datasourceUid: prometheus
+      nodeGraph:
+        enabled: true
+      search:
+        hide: false
+      lokiSearch:
+        datasourceUid: loki
+
+  - name: Pyroscope
+    type: grafana-pyroscope-datasource
+    uid: pyroscope
+    access: proxy
+    url: http://pyroscope:4040
+    isDefault: false
+    editable: true
+    jsonData:
+      keepCookies: []
+      minStep: '15s'
```
|
|||||||
43  docker-compose/loki-config.yml  Normal file
@@ -0,0 +1,43 @@
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+  grpc_listen_port: 9096
+
+common:
+  instance_addr: 127.0.0.1
+  path_prefix: /loki
+  storage:
+    filesystem:
+      chunks_directory: /loki/chunks
+      rules_directory: /loki/rules
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+
+query_range:
+  results_cache:
+    cache:
+      embedded_cache:
+        enabled: true
+        max_size_mb: 100
+
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: tsdb
+      object_store: filesystem
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h
+
+ruler:
+  alertmanager_url: http://localhost:9093
+
+limits_config:
+  retention_period: 720h
+  ingestion_rate_mb: 10
+  ingestion_burst_size_mb: 20
+  volume_enabled: true
@@ -15,6 +15,10 @@ processors:
 exporters:
   prometheus:
     endpoint: "0.0.0.0:8889"
+  otlp/tempo:
+    endpoint: tempo:4317
+    tls:
+      insecure: true
   debug:
     verbosity: detailed
 
@@ -25,6 +29,11 @@ service:
       processors: [batch]
       exporters: [prometheus, debug]
 
+    traces:
+      receivers: [otlp]
+      processors: [batch]
+      exporters: [otlp/tempo, debug]
+
     logs:
       receivers: [otlp]
       processors: [batch]
@@ -19,3 +19,11 @@ scrape_configs:
       # Reaches the Collector via its Docker service name
       - targets: ['otel-collector:8889']
 
+  # Job 3: Node Exporter - host system metrics
+  - job_name: 'node-exporter'
+    static_configs:
+      - targets: ['node-exporter:9100']
+        labels:
+          instance: 'docker-host'
+          environment: 'production'
+
27  docker-compose/promtail-config.yml  Normal file
@@ -0,0 +1,27 @@
+server:
+  http_listen_port: 9080
+  grpc_listen_port: 0
+
+positions:
+  filename: /tmp/positions.yaml
+
+clients:
+  - url: http://loki:3100/loki/api/v1/push
+    external_labels:
+      environment: production
+      cluster: myapp-cluster
+
+scrape_configs:
+  - job_name: docker
+    docker_sd_configs:
+      - host: unix:///var/run/docker.sock
+        refresh_interval: 5s
+        filters:
+          - name: label
+            values: ["logging=promtail"]
+    relabel_configs:
+      - source_labels: ['__meta_docker_container_name']
+        regex: '/(.*)'
+        target_label: 'container'
+      - source_labels: ['__meta_docker_container_label_logging_jobname']
+        target_label: 'job'
15  docker-compose/rules/symon_rules.yml  Normal file
@@ -0,0 +1,15 @@
+groups:
+  - name: symon_process_metrics
+    interval: 30s
+    rules:
+      - record: system_process_cpu_usage_percent:recent
+        expr: |
+          system_process_cpu_usage_percent
+          and on(pid, name)
+          (time() - timestamp(system_process_cpu_usage_percent) < 120)
+
+      - record: system_process_memory_usage_bytes:recent
+        expr: |
+          system_process_memory_usage_bytes
+          and on(pid, name)
+          (time() - timestamp(system_process_memory_usage_bytes) < 120)
49  docker-compose/tempo-config.yml  Normal file
@@ -0,0 +1,49 @@
+server:
+  http_listen_port: 3200
+
+distributor:
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:4317
+        http:
+          endpoint: 0.0.0.0:4318
+
+ingester:
+  max_block_duration: 5m
+
+compactor:
+  compaction:
+    block_retention: 48h
+
+storage:
+  trace:
+    backend: local
+    wal:
+      path: /tmp/tempo/wal
+    local:
+      path: /tmp/tempo/blocks
+
+query_frontend:
+  search:
+    duration_slo: 5s
+    throughput_bytes_slo: 1.073741824e+09
+  trace_by_id:
+    duration_slo: 5s
+
+metrics_generator:
+  registry:
+    external_labels:
+      source: tempo
+      cluster: docker-compose
+  storage:
+    path: /tmp/tempo/generator/wal
+    remote_write:
+      - url: http://prometheus:9090/api/v1/write
+        send_exemplars: true
+
+overrides:
+  defaults:
+    metrics_generator:
+      processors: [service-graphs, span-metrics]
@@ -3,7 +3,7 @@
 
 set -e
 
-echo "🔍 Testing Bottom OpenTelemetry Stack..."
+echo "🔍 Testing Symon OpenTelemetry Stack..."
 echo ""
 
 # Colors
@@ -75,6 +75,6 @@ echo " - Grafana: http://localhost:3000 (admin/admin)"
 echo " - OTEL Collector metrics: http://localhost:8889/metrics"
 echo ""
 echo "💡 Next steps:"
-echo " 1. Build bottom with: cargo build --release --features opentelemetry"
+echo " 1. Build Symon with: cargo build --release --features opentelemetry"
 echo " 2. Run in headless mode: ./target/release/btm --headless"
 echo " 3. Check metrics in Prometheus: http://localhost:9090/graph"
@@ -1,15 +1,15 @@
 # Example process filter configuration file
-# This file can be included from the main bottom config to keep
+# This file can be included from the main symon config to keep
 # server-specific process lists separate.
 #
-# Usage in bottom-config.toml:
+# Usage in symon-config.toml:
 # [opentelemetry.metrics.process_filter]
 # include = "processes.toml"
 
 # Filter mode: "whitelist" or "blacklist"
 # - whitelist: Only export metrics for processes in the lists below
 # - blacklist: Export metrics for all processes EXCEPT those in the lists
-filter_mode = "whitelist"
+filter_mode = "blacklist"
 
 # Process names to monitor (case-insensitive substring match)
 # Examples for common server processes:
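The whitelist/blacklist semantics described above boil down to a case-insensitive substring match against the configured names. A self-contained sketch of that logic (illustrative types only, not the crate's actual `ProcessFilterConfig`):

```rust
// Hypothetical stand-in for the whitelist/blacklist substring filter:
// whitelist keeps only matching processes, blacklist drops them.

#[derive(Clone, Copy)]
enum FilterMode {
    Whitelist,
    Blacklist,
}

fn matches_any(name: &str, patterns: &[&str]) -> bool {
    let lower = name.to_lowercase();
    patterns.iter().any(|p| lower.contains(&p.to_lowercase()))
}

fn should_include(name: &str, mode: FilterMode, patterns: &[&str]) -> bool {
    match mode {
        FilterMode::Whitelist => matches_any(name, patterns),
        FilterMode::Blacklist => !matches_any(name, patterns),
    }
}

fn main() {
    let patterns = ["kworker", "systemd"];
    // Blacklist: everything except the listed names is exported.
    assert!(should_include("nginx: worker", FilterMode::Blacklist, &patterns));
    assert!(!should_include("kworker/0:1", FilterMode::Blacklist, &patterns));
    // Whitelist: only the listed names are exported.
    assert!(!should_include("nginx: worker", FilterMode::Whitelist, &patterns));
}
```

Switching the example file from whitelist to blacklist, as the commit does, means new server processes are exported by default instead of silently ignored.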
370  src/collector.rs
@@ -1,6 +1,8 @@
 use crate::config::MetricsConfig;
 use anyhow::Result;
-use sysinfo::{CpuRefreshKind, Disks, Networks, RefreshKind, System};
+use std::collections::HashMap;
+use std::time::Instant;
+use sysinfo::{Disks, Networks, ProcessesToUpdate, System};
 
 /// System metrics collected at a point in time
 #[derive(Debug, Clone)]
@@ -11,6 +13,8 @@ pub struct SystemMetrics {
     pub disk: Option<Vec<DiskMetric>>,
     pub processes: Option<Vec<ProcessMetric>>,
     pub temperature: Option<Vec<TemperatureMetric>>,
+    pub load_avg: Option<LoadAvgMetric>,
+    pub disk_io: Option<Vec<DiskIoMetric>>,
 }
 
 #[derive(Debug, Clone)]
@@ -30,8 +34,12 @@ pub struct MemoryMetric {
 #[derive(Debug, Clone)]
 pub struct NetworkMetric {
     pub interface_name: String,
-    pub rx_bytes_total: u64,
-    pub tx_bytes_total: u64,
+    pub rx_bytes_per_sec: u64,
+    pub tx_bytes_per_sec: u64,
+    pub rx_packets_per_sec: u64,
+    pub tx_packets_per_sec: u64,
+    pub rx_errors_per_sec: u64,
+    pub tx_errors_per_sec: u64,
 }
 
 #[derive(Debug, Clone)]
@@ -56,34 +64,93 @@ pub struct TemperatureMetric {
     pub temperature_celsius: f32,
 }
 
+#[derive(Debug, Clone)]
+pub struct LoadAvgMetric {
+    pub load1: f64,
+    pub load5: f64,
+    pub load15: f64,
+}
+
+#[derive(Debug, Clone)]
+pub struct DiskIoMetric {
+    pub device_name: String,
+    pub read_bytes_per_sec: u64,
+    pub write_bytes_per_sec: u64,
+    pub read_ops_per_sec: u64,
+    pub write_ops_per_sec: u64,
+}
+
 /// Collector for system metrics
 pub struct MetricsCollector {
     system: System,
     networks: Networks,
     disks: Disks,
     config: MetricsConfig,
+    // Network rate calculation state
+    last_network_stats: HashMap<String, NetworkStats>,
+    last_network_time: Option<Instant>,
+    // Disk I/O rate calculation state
+    last_disk_io_stats: HashMap<String, DiskIoStats>,
+    last_disk_io_time: Option<Instant>,
+}
+
+#[derive(Debug, Clone)]
+struct NetworkStats {
+    rx_bytes: u64,
+    tx_bytes: u64,
+    rx_packets: u64,
+    tx_packets: u64,
+    rx_errors: u64,
+    tx_errors: u64,
+}
+
+#[derive(Debug, Clone)]
+struct DiskIoStats {
+    read_bytes: u64,
+    write_bytes: u64,
+    read_count: u64,
+    write_count: u64,
 }
 
 impl MetricsCollector {
     pub fn new(config: MetricsConfig) -> Self {
-        let refresh_kind = RefreshKind::new()
-            .with_cpu(CpuRefreshKind::everything())
-            .with_memory(sysinfo::MemoryRefreshKind::everything())
-            .with_processes(sysinfo::ProcessRefreshKind::everything());
+        // Initialize with minimal data - we'll refresh on-demand
 
         Self {
-            system: System::new_with_specifics(refresh_kind),
+            system: System::new(),
            networks: Networks::new_with_refreshed_list(),
            disks: Disks::new_with_refreshed_list(),
            config,
+            last_network_stats: HashMap::new(),
+            last_network_time: None,
+            last_disk_io_stats: HashMap::new(),
+            last_disk_io_time: None,
        }
    }
 
    /// Collect all enabled metrics
    pub fn collect(&mut self) -> Result<SystemMetrics> {
-        // Refresh system info
-        self.system.refresh_all();
-        self.networks.refresh();
+        // Refresh only what's needed based on enabled metrics
+        if self.config.cpu {
+            self.system.refresh_cpu_all();
+        }
+
+        if self.config.memory {
+            self.system.refresh_memory();
+        }
+
+        if self.config.processes {
+            self.system.refresh_processes(ProcessesToUpdate::All, true);
+        }
+
+        if self.config.network {
+            self.networks.refresh(true);
+        }
+
+        if self.config.disk {
+            self.disks.refresh(true);
+        }
+
+        // Note: Temperature metrics are currently not implemented
+
        Ok(SystemMetrics {
            cpu: if self.config.cpu {
@@ -116,6 +183,16 @@ impl MetricsCollector {
             } else {
                 None
             },
+            load_avg: if self.config.load_avg {
+                Some(self.collect_load_avg())
+            } else {
+                None
+            },
+            disk_io: if self.config.disk_io {
+                Some(self.collect_disk_io())
+            } else {
+                None
+            },
         })
     }
 
@@ -140,15 +217,72 @@ impl MetricsCollector {
         }
     }
 
-    fn collect_network(&self) -> Vec<NetworkMetric> {
-        self.networks
-            .iter()
-            .map(|(interface_name, data)| NetworkMetric {
+    fn collect_network(&mut self) -> Vec<NetworkMetric> {
+        let now = Instant::now();
+        let mut metrics = Vec::new();
+
+        // Calculate time delta
+        let time_delta_secs = if let Some(last_time) = self.last_network_time {
+            now.duration_since(last_time).as_secs_f64()
+        } else {
+            // First collection, no rate to calculate
+            self.last_network_time = Some(now);
+            for (interface_name, data) in self.networks.iter() {
+                self.last_network_stats.insert(
+                    interface_name.to_string(),
+                    NetworkStats {
+                        rx_bytes: data.total_received(),
+                        tx_bytes: data.total_transmitted(),
+                        rx_packets: data.total_packets_received(),
+                        tx_packets: data.total_packets_transmitted(),
+                        rx_errors: data.total_errors_on_received(),
+                        tx_errors: data.total_errors_on_transmitted(),
+                    },
+                );
+            }
+            return metrics; // Return empty on first run
+        };
+
+        // Update timestamp
+        self.last_network_time = Some(now);
+
+        // Calculate rates for each interface
+        for (interface_name, data) in self.networks.iter() {
+            let current_stats = NetworkStats {
+                rx_bytes: data.total_received(),
+                tx_bytes: data.total_transmitted(),
+                rx_packets: data.total_packets_received(),
+                tx_packets: data.total_packets_transmitted(),
+                rx_errors: data.total_errors_on_received(),
+                tx_errors: data.total_errors_on_transmitted(),
+            };
+
+            if let Some(last_stats) = self.last_network_stats.get(interface_name.as_str()) {
+                // Calculate rates per second
+                metrics.push(NetworkMetric {
                     interface_name: interface_name.to_string(),
-                rx_bytes_total: data.total_received(),
-                tx_bytes_total: data.total_transmitted(),
-            })
-            .collect()
+                    rx_bytes_per_sec: Self::calculate_rate(current_stats.rx_bytes, last_stats.rx_bytes, time_delta_secs),
+                    tx_bytes_per_sec: Self::calculate_rate(current_stats.tx_bytes, last_stats.tx_bytes, time_delta_secs),
+                    rx_packets_per_sec: Self::calculate_rate(current_stats.rx_packets, last_stats.rx_packets, time_delta_secs),
+                    tx_packets_per_sec: Self::calculate_rate(current_stats.tx_packets, last_stats.tx_packets, time_delta_secs),
+                    rx_errors_per_sec: Self::calculate_rate(current_stats.rx_errors, last_stats.rx_errors, time_delta_secs),
+                    tx_errors_per_sec: Self::calculate_rate(current_stats.tx_errors, last_stats.tx_errors, time_delta_secs),
+                });
+            }
+
+            // Update last stats
+            self.last_network_stats.insert(interface_name.to_string(), current_stats);
+        }
+
+        metrics
+    }
+
+    fn calculate_rate(current: u64, last: u64, time_delta: f64) -> u64 {
+        if current >= last {
+            ((current - last) as f64 / time_delta) as u64
+        } else {
+            0 // Counter wrapped or interface reset
+        }
     }
 
     fn collect_disk(&self) -> Vec<DiskMetric> {
@@ -171,36 +305,62 @@ impl MetricsCollector {
 
     fn collect_processes(&self) -> Vec<ProcessMetric> {
         let filter = self.config.process_filter.as_ref();
+        let max_processes = filter.map(|f| f.max_processes).unwrap_or(10);
+
-        let mut processes: Vec<ProcessMetric> = self
-            .system
-            .processes()
-            .iter()
-            .filter(|(_, process)| {
+        // Pre-allocate with expected capacity
+        let mut processes: Vec<ProcessMetric> = Vec::with_capacity(max_processes);
+
+        // Collect only processes that pass the filter
+        for (_, process) in self.system.processes().iter() {
+            // Skip if filter rejects this process
             if let Some(filter_config) = filter {
-                filter_config.should_include_process(
-                    process.name().to_string_lossy().as_ref(),
-                    process.pid().as_u32(),
-                )
-            } else {
-                true
+                let process_name = process.name().to_string_lossy();
+                if !filter_config.should_include_process(process_name.as_ref(), process.pid().as_u32()) {
+                    continue;
+                }
             }
-            })
-            .map(|(_, process)| ProcessMetric {
+
+            let cpu_usage = process.cpu_usage();
+
+            // If we haven't reached max_processes yet, just add it
+            if processes.len() < max_processes {
+                processes.push(ProcessMetric {
                     pid: process.pid().as_u32(),
                     name: process.name().to_string_lossy().to_string(),
-                cpu_usage_percent: process.cpu_usage(),
+                    cpu_usage_percent: cpu_usage,
                     memory_bytes: process.memory(),
+                });
+            } else {
+                // Find the process with minimum CPU usage in our list
+                if let Some(min_idx) = processes
+                    .iter()
+                    .enumerate()
+                    .min_by(|(_, a), (_, b)| {
+                        a.cpu_usage_percent
+                            .partial_cmp(&b.cpu_usage_percent)
+                            .unwrap_or(std::cmp::Ordering::Equal)
                     })
-            .collect();
+                    .map(|(idx, _)| idx)
+                {
+                    // Replace if current process has higher CPU usage
+                    if cpu_usage > processes[min_idx].cpu_usage_percent {
+                        processes[min_idx] = ProcessMetric {
+                            pid: process.pid().as_u32(),
+                            name: process.name().to_string_lossy().to_string(),
+                            cpu_usage_percent: cpu_usage,
+                            memory_bytes: process.memory(),
+                        };
+                    }
+                }
+            }
+        }
 
-        // Sort by CPU usage and limit to top 10
+        // Final sort by CPU usage (descending)
         processes.sort_by(|a, b| {
             b.cpu_usage_percent
                 .partial_cmp(&a.cpu_usage_percent)
                 .unwrap_or(std::cmp::Ordering::Equal)
         });
-        processes.truncate(10);
 
         processes
     }
@@ -211,4 +371,140 @@ impl MetricsCollector {
         // For now, return empty vector
         vec![]
     }
+
+    fn collect_load_avg(&self) -> LoadAvgMetric {
+        let load_avg = System::load_average();
+        LoadAvgMetric {
+            load1: load_avg.one,
+            load5: load_avg.five,
+            load15: load_avg.fifteen,
+        }
+    }
+
+    fn collect_disk_io(&mut self) -> Vec<DiskIoMetric> {
+        #[cfg(target_os = "linux")]
+        {
+            self.collect_disk_io_linux()
+        }
+
+        #[cfg(not(target_os = "linux"))]
+        {
+            // Disk I/O metrics only supported on Linux for now
+            vec![]
+        }
+    }
+
+    #[cfg(target_os = "linux")]
+    fn collect_disk_io_linux(&mut self) -> Vec<DiskIoMetric> {
+        let now = Instant::now();
+        let mut metrics = Vec::new();
+
+        // Calculate time delta
+        let time_delta_secs = if let Some(last_time) = self.last_disk_io_time {
+            now.duration_since(last_time).as_secs_f64()
+        } else {
+            // First collection, just store the values
+            self.last_disk_io_time = Some(now);
+            if let Ok(stats) = Self::read_diskstats() {
+                self.last_disk_io_stats = stats;
+            }
+            return metrics; // Return empty on first run
+        };
+
+        // Update timestamp
+        self.last_disk_io_time = Some(now);
+
+        // Read current disk stats
+        let current_stats = match Self::read_diskstats() {
+            Ok(stats) => stats,
+            Err(_) => return metrics,
+        };
+
+        // Calculate rates for each disk
+        for (device_name, current) in &current_stats {
+            if let Some(last) = self.last_disk_io_stats.get(device_name) {
+                // Skip if device is a partition (has digits at the end) - we only want whole disks
+                // unless it's nvme, loop, or similar devices
+                if Self::should_include_device(device_name) {
+                    metrics.push(DiskIoMetric {
+                        device_name: device_name.clone(),
+                        read_bytes_per_sec: Self::calculate_rate(current.read_bytes, last.read_bytes, time_delta_secs),
+                        write_bytes_per_sec: Self::calculate_rate(current.write_bytes, last.write_bytes, time_delta_secs),
+                        read_ops_per_sec: Self::calculate_rate(current.read_count, last.read_count, time_delta_secs),
+                        write_ops_per_sec: Self::calculate_rate(current.write_count, last.write_count, time_delta_secs),
+                    });
+                }
+            }
+        }
+
+        // Update last stats
+        self.last_disk_io_stats = current_stats;
+
+        metrics
+    }
+
+    #[cfg(target_os = "linux")]
+    fn read_diskstats() -> std::io::Result<HashMap<String, DiskIoStats>> {
+        use std::fs::File;
+        use std::io::{BufRead, BufReader};
+
+        let file = File::open("/proc/diskstats")?;
+        let reader = BufReader::new(file);
+        let mut stats = HashMap::new();
+
+        for line in reader.lines() {
+            let line = line?;
+            let fields: Vec<&str> = line.split_whitespace().collect();
+
+            // /proc/diskstats format:
+            // major minor name reads reads_merged sectors_read time_reading writes writes_merged sectors_written time_writing ...
+            // We need: name (field 2), reads (field 3), sectors_read (field 5), writes (field 7), sectors_written (field 9)
+            if fields.len() >= 14 {
+                let device_name = fields[2].to_string();
+
+                // Parse fields (with error handling)
+                let reads = fields[3].parse::<u64>().unwrap_or(0);
+                let sectors_read = fields[5].parse::<u64>().unwrap_or(0);
+                let writes = fields[7].parse::<u64>().unwrap_or(0);
+                let sectors_written = fields[9].parse::<u64>().unwrap_or(0);
+
+                // Sector size is typically 512 bytes
+                let read_bytes = sectors_read * 512;
+                let write_bytes = sectors_written * 512;
+
+                stats.insert(
+                    device_name,
+                    DiskIoStats {
+                        read_bytes,
+                        write_bytes,
+                        read_count: reads,
+                        write_count: writes,
+                    },
+                );
+            }
+        }
+
+        Ok(stats)
+    }
+
+    #[cfg(target_os = "linux")]
+    fn should_include_device(device_name: &str) -> bool {
+        // Include whole disks: sda, nvme0n1, vda, hda, etc.
+        // Exclude partitions: sda1, nvme0n1p1, vda1, etc.
+        // Also exclude loop devices, ram devices, and other virtual devices
+
+        if device_name.starts_with("loop") ||
+            device_name.starts_with("ram") ||
+            device_name.starts_with("dm-") {
+            return false;
+        }
+
+        // For nvme devices: include nvme0n1 but exclude nvme0n1p1
+        if device_name.starts_with("nvme") {
+            return !device_name.contains('p');
+        }
+
+        // For standard devices (sd*, vd*, hd*): check if last char is a digit
+        !device_name.chars().last().map(|c| c.is_ascii_digit()).unwrap_or(false)
+    }
 }
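The per-second network and disk figures above come from deltas of cumulative counters divided by the elapsed collection interval, clamped to zero when a counter goes backwards. A standalone sketch of that arithmetic (the zero-interval guard is an extra safety added for this example, not part of the patch):

```rust
// Counter-delta rate: (current - last) / elapsed_seconds, with 0 as the
// fallback when the counter wraps, the device resets, or no time has passed.
fn calculate_rate(current: u64, last: u64, time_delta: f64) -> u64 {
    if current >= last && time_delta > 0.0 {
        ((current - last) as f64 / time_delta) as u64
    } else {
        0
    }
}

fn main() {
    // 10 MiB received over a 2-second collection interval -> 5 MiB/s.
    let rate = calculate_rate(30 * 1024 * 1024, 20 * 1024 * 1024, 2.0);
    assert_eq!(rate, 5 * 1024 * 1024);

    // Counter went backwards (e.g. interface reset): report 0 instead of garbage.
    assert_eq!(calculate_rate(100, 500, 2.0), 0);
    println!("rx rate: {} bytes/s", rate);
}
```

Because the rate needs two samples, the first collection only seeds the cached counters and returns no metrics; values start flowing from the second interval onward.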
@@ -166,6 +166,14 @@ pub struct MetricsConfig {
     #[serde(default = "default_true")]
     pub temperature: bool,
+
+    /// Export load average metrics
+    #[serde(default = "default_true")]
+    pub load_avg: bool,
+
+    /// Export disk I/O metrics
+    #[serde(default = "default_true")]
+    pub disk_io: bool,
 
     /// Process filter configuration
     #[serde(default)]
     pub process_filter: Option<ProcessFilterConfig>,
@@ -180,6 +188,8 @@ impl Default for MetricsConfig {
             disk: true,
             processes: false,
             temperature: true,
+            load_avg: true,
+            disk_io: true,
             process_filter: None,
         }
     }
@@ -195,6 +205,10 @@ pub struct ProcessFilterConfig {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub filter_mode: Option<ProcessFilterMode>,
+
+    /// Maximum number of processes to report (top N by CPU usage)
+    #[serde(default = "default_max_processes")]
+    pub max_processes: usize,
 
     /// List of process names to filter (case-insensitive substring match)
     #[serde(default)]
     pub names: Vec<String>,
@@ -236,6 +250,7 @@ impl ProcessFilterConfig {
         let mut merged = Self {
             include: None,
             filter_mode: included.filter_mode.or(self.filter_mode),
+            max_processes: included.max_processes,
             names: if included.names.is_empty() {
                 self.names.clone()
             } else {
@@ -345,3 +360,7 @@ fn default_timeout() -> u64 {
 fn default_true() -> bool {
     true
 }
+
+fn default_max_processes() -> usize {
+    10
+}
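The new `load_avg`, `disk_io`, and `max_processes` fields rely on serde's per-field defaults, so existing TOML files keep working without edits. A cut-down sketch of the pattern (a stand-in struct, not the crate's real config type, assuming the serde derive and toml crates):

```rust
// Fields omitted from the TOML fall back to the named default function.
use serde::Deserialize;

fn default_true() -> bool {
    true
}

fn default_max_processes() -> usize {
    10
}

#[derive(Debug, Deserialize)]
struct MetricsConfigSketch {
    #[serde(default = "default_true")]
    load_avg: bool,
    #[serde(default = "default_true")]
    disk_io: bool,
    #[serde(default = "default_max_processes")]
    max_processes: usize,
}

fn main() {
    // Only disk_io is set explicitly; the other fields take their defaults.
    let cfg: MetricsConfigSketch = toml::from_str("disk_io = false").unwrap();
    assert!(cfg.load_avg);
    assert!(!cfg.disk_io);
    assert_eq!(cfg.max_processes, 10);
    println!("{cfg:?}");
}
```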
131  src/exporter.rs
@@ -18,13 +18,24 @@ struct MetricInstruments {
     memory_total: opentelemetry::metrics::Gauge<u64>,
     swap_usage: opentelemetry::metrics::Gauge<u64>,
     swap_total: opentelemetry::metrics::Gauge<u64>,
-    network_rx: opentelemetry::metrics::Counter<u64>,
-    network_tx: opentelemetry::metrics::Counter<u64>,
+    network_rx: opentelemetry::metrics::Gauge<u64>,
+    network_tx: opentelemetry::metrics::Gauge<u64>,
+    network_rx_packets: opentelemetry::metrics::Gauge<u64>,
+    network_tx_packets: opentelemetry::metrics::Gauge<u64>,
+    network_rx_errors: opentelemetry::metrics::Gauge<u64>,
+    network_tx_errors: opentelemetry::metrics::Gauge<u64>,
     disk_usage: opentelemetry::metrics::Gauge<u64>,
     disk_total: opentelemetry::metrics::Gauge<u64>,
+    disk_io_read_bytes: opentelemetry::metrics::Gauge<u64>,
+    disk_io_write_bytes: opentelemetry::metrics::Gauge<u64>,
+    disk_io_read_ops: opentelemetry::metrics::Gauge<u64>,
+    disk_io_write_ops: opentelemetry::metrics::Gauge<u64>,
     process_cpu: opentelemetry::metrics::Gauge<f64>,
     process_memory: opentelemetry::metrics::Gauge<u64>,
     temperature: opentelemetry::metrics::Gauge<f64>,
+    load_avg_1: opentelemetry::metrics::Gauge<f64>,
+    load_avg_5: opentelemetry::metrics::Gauge<f64>,
+    load_avg_15: opentelemetry::metrics::Gauge<f64>,
 }
 
 impl MetricsExporter {
@@ -40,20 +51,20 @@ impl MetricsExporter {
             resource_kvs.push(KeyValue::new(key.clone(), value.clone()));
         }
 
-        let resource = Resource::new(resource_kvs);
+        let resource = Resource::builder_empty()
+            .with_attributes(resource_kvs)
+            .build();
 
         // Build OTLP exporter using new pipeline API
-        let exporter = opentelemetry_otlp::new_exporter()
-            .tonic()
+        let exporter = opentelemetry_otlp::MetricExporter::builder()
+            .with_tonic()
             .with_endpoint(&config.endpoint)
             .with_timeout(config.export_timeout())
-            .build_metrics_exporter(
-                Box::new(opentelemetry_sdk::metrics::reader::DefaultTemporalitySelector::default())
-            )
+            .build()
             .context("Failed to build OTLP metrics exporter")?;
 
         // Build meter provider
-        let reader = PeriodicReader::builder(exporter, opentelemetry_sdk::runtime::Tokio)
+        let reader = PeriodicReader::builder(exporter)
             .with_interval(config.export_interval())
             .build();
 
@@ -69,51 +80,95 @@ impl MetricsExporter {
             cpu_usage: meter
                 .f64_gauge("system_cpu_usage_percent")
                 .with_description("CPU usage percentage per core")
-                .init(),
+                .build(),
             memory_usage: meter
                 .u64_gauge("system_memory_usage_bytes")
                 .with_description("Memory usage in bytes")
-                .init(),
+                .build(),
             memory_total: meter
                 .u64_gauge("system_memory_total_bytes")
                 .with_description("Total memory in bytes")
-                .init(),
+                .build(),
             swap_usage: meter
                 .u64_gauge("system_swap_usage_bytes")
                 .with_description("Swap usage in bytes")
-                .init(),
+                .build(),
             swap_total: meter
                 .u64_gauge("system_swap_total_bytes")
                 .with_description("Total swap in bytes")
-                .init(),
+                .build(),
             network_rx: meter
-                .u64_counter("system_network_rx_bytes_total")
-                .with_description("Total bytes received")
-                .init(),
+                .u64_gauge("system_network_rx_bytes_per_sec")
+                .with_description("Bytes received per second")
+                .build(),
             network_tx: meter
-                .u64_counter("system_network_tx_bytes_total")
-                .with_description("Total bytes transmitted")
-                .init(),
+                .u64_gauge("system_network_tx_bytes_per_sec")
+                .with_description("Bytes transmitted per second")
+                .build(),
+            network_rx_packets: meter
+                .u64_gauge("system_network_rx_packets_per_sec")
+                .with_description("Packets received per second")
+                .build(),
+            network_tx_packets: meter
+                .u64_gauge("system_network_tx_packets_per_sec")
+                .with_description("Packets transmitted per second")
+                .build(),
+            network_rx_errors: meter
+                .u64_gauge("system_network_rx_errors_per_sec")
+                .with_description("Receive errors per second")
+                .build(),
+            network_tx_errors: meter
+                .u64_gauge("system_network_tx_errors_per_sec")
+                .with_description("Transmit errors per second")
+                .build(),
             disk_usage: meter
                 .u64_gauge("system_disk_usage_bytes")
                 .with_description("Disk usage in bytes")
-                .init(),
+                .build(),
             disk_total: meter
                 .u64_gauge("system_disk_total_bytes")
                 .with_description("Total disk space in bytes")
-                .init(),
+                .build(),
             process_cpu: meter
                 .f64_gauge("system_process_cpu_usage_percent")
                 .with_description("Process CPU usage percentage")
-                .init(),
+                .build(),
             process_memory: meter
                 .u64_gauge("system_process_memory_usage_bytes")
                 .with_description("Process memory usage in bytes")
-                .init(),
+                .build(),
             temperature: meter
                 .f64_gauge("system_temperature_celsius")
                 .with_description("Temperature in Celsius")
-                .init(),
+                .build(),
+            disk_io_read_bytes: meter
+                .u64_gauge("system_disk_io_read_bytes_per_sec")
+                .with_description("Disk read bytes per second")
+                .build(),
+            disk_io_write_bytes: meter
+                .u64_gauge("system_disk_io_write_bytes_per_sec")
+                .with_description("Disk write bytes per second")
+                .build(),
+            disk_io_read_ops: meter
+                .u64_gauge("system_disk_io_read_ops_per_sec")
+                .with_description("Disk read operations per second")
+                .build(),
+            disk_io_write_ops: meter
+                .u64_gauge("system_disk_io_write_ops_per_sec")
+                .with_description("Disk write operations per second")
+                .build(),
+            load_avg_1: meter
+                .f64_gauge("system_load_average_1m")
+                .with_description("System load average over 1 minute")
+                .build(),
+            load_avg_5: meter
+                .f64_gauge("system_load_average_5m")
+                .with_description("System load average over 5 minutes")
+                .build(),
+            load_avg_15: meter
+                .f64_gauge("system_load_average_15m")
+                .with_description("System load average over 15 minutes")
+                .build(),
         };
 
         Ok(Self {
@@ -145,8 +200,12 @@ impl MetricsExporter {
         if let Some(network_metrics) = &metrics.network {
             for net in network_metrics {
                 let attrs = &[KeyValue::new("interface", net.interface_name.clone())];
-                self.gauges.network_rx.add(net.rx_bytes_total, attrs);
-                self.gauges.network_tx.add(net.tx_bytes_total, attrs);
+                self.gauges.network_rx.record(net.rx_bytes_per_sec, attrs);
+                self.gauges.network_tx.record(net.tx_bytes_per_sec, attrs);
+                self.gauges.network_rx_packets.record(net.rx_packets_per_sec, attrs);
+                self.gauges.network_tx_packets.record(net.tx_packets_per_sec, attrs);
+                self.gauges.network_rx_errors.record(net.rx_errors_per_sec, attrs);
+                self.gauges.network_tx_errors.record(net.tx_errors_per_sec, attrs);
             }
         }
 
@@ -185,6 +244,24 @@ impl MetricsExporter {
                 );
             }
         }
+
+        // Export load average metrics
+        if let Some(load_avg) = &metrics.load_avg {
+            self.gauges.load_avg_1.record(load_avg.load1, &[]);
+            self.gauges.load_avg_5.record(load_avg.load5, &[]);
+            self.gauges.load_avg_15.record(load_avg.load15, &[]);
+        }
+
+        // Export disk I/O metrics
+        if let Some(disk_io_metrics) = &metrics.disk_io {
+            for disk_io in disk_io_metrics {
+                let attrs = &[KeyValue::new("device", disk_io.device_name.clone())];
+                self.gauges.disk_io_read_bytes.record(disk_io.read_bytes_per_sec, attrs);
+                self.gauges.disk_io_write_bytes.record(disk_io.write_bytes_per_sec, attrs);
+                self.gauges.disk_io_read_ops.record(disk_io.read_ops_per_sec, attrs);
+                self.gauges.disk_io_write_ops.record(disk_io.write_ops_per_sec, attrs);
+            }
+        }
     }
 
     pub async fn shutdown(self) -> Result<()> {
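Since the exporter now reports already-computed rates, every network and disk I/O instrument is a gauge recorded with an `interface` or `device` attribute rather than a monotonically increasing counter. A minimal sketch of that call pattern with the OpenTelemetry Rust metrics API, assuming the instrument names shown above (the global, no-op-by-default meter is used here purely for illustration; the real exporter wires its meter to an OTLP PeriodicReader):

```rust
// Record one per-second gauge data point, labelled per interface.
use opentelemetry::{global, KeyValue};

fn main() {
    let meter = global::meter("symon-example");
    let network_rx = meter
        .u64_gauge("system_network_rx_bytes_per_sec")
        .with_description("Bytes received per second")
        .build();

    // One data point per interface, attributed the same way the exporter does.
    network_rx.record(5_242_880, &[KeyValue::new("interface", "eth0")]);
}
```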
27  symon.toml
@@ -33,10 +33,12 @@ environment = "production"
 [metrics]
 cpu = true          # CPU usage per core
 memory = true       # RAM and swap usage
-network = true      # Network RX/TX
+network = true      # Network RX/TX bytes, packets, errors
 disk = true         # Disk usage
-processes = false   # Top 10 processes (disabled by default - can generate high cardinality)
+processes = true    # Top N processes by CPU (disabled by default - can generate high cardinality)
 temperature = true  # System temperatures (if available)
+load_avg = true     # System load average (1m, 5m, 15m)
+disk_io = true      # Disk I/O read/write bytes and operations (Linux only)
 
 # Process filtering configuration
 # Only used when processes = true
@@ -46,23 +48,16 @@ temperature = true # System temperatures (if available)
 
 # Option 2: Configure inline
 # Filter mode: "whitelist" (only listed processes) or "blacklist" (exclude listed)
-filter_mode = "whitelist"
+filter_mode = "blacklist"
 
+# Maximum number of processes to report (sorted by CPU usage, default: 10)
+max_processes = 5
+
 # List of process names to filter (case-insensitive substring match)
 names = [
-    # Web servers
-    "nginx",
-    "apache",
-
-    # Databases
-    "postgres",
-    "mysql",
-    "redis",
-
-    # Application servers
-    # "java",
-    # "node",
-    # "python",
+    # Exclude system processes that generate too much noise
+    # "kworker",
+    # "systemd",
 ]
 
 # List of regex patterns to match process names (case-sensitive)