init
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
/target
|
||||
1841
Cargo.lock
generated
Normal file
1841
Cargo.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
44
Cargo.toml
Normal file
44
Cargo.toml
Normal file
@@ -0,0 +1,44 @@
|
||||
[package]
|
||||
name = "symon"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
authors = ["Alex"]
|
||||
description = "Lightweight system metrics exporter for OpenTelemetry"
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/battilo/symon"
|
||||
|
||||
[dependencies]
|
||||
# OpenTelemetry
|
||||
opentelemetry = { version = "0.26", features = ["metrics"] }
|
||||
opentelemetry-otlp = { version = "0.26", features = ["metrics", "grpc-tonic"] }
|
||||
opentelemetry_sdk = { version = "0.26", features = ["metrics", "rt-tokio"] }
|
||||
opentelemetry-semantic-conventions = "0.26"
|
||||
|
||||
# Async runtime
|
||||
tokio = { version = "1.48", features = ["rt-multi-thread", "macros", "sync", "time", "signal"] }
|
||||
tonic = "0.11"
|
||||
|
||||
# System metrics collection
|
||||
sysinfo = "0.31"
|
||||
|
||||
# Configuration
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
toml = "0.8"
|
||||
|
||||
# Logging and error handling
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
anyhow = "1.0"
|
||||
thiserror = "1.0"
|
||||
|
||||
# Process filtering
|
||||
regex = "1.11"
|
||||
|
||||
# CLI
|
||||
clap = { version = "4.5", features = ["derive"] }
|
||||
|
||||
[profile.release]
|
||||
opt-level = 3
|
||||
lto = true
|
||||
codegen-units = 1
|
||||
strip = true
|
||||
227
docker-compose/METRICS.md
Normal file
227
docker-compose/METRICS.md
Normal file
@@ -0,0 +1,227 @@
|
||||
# Bottom OpenTelemetry Metrics Reference
|
||||
|
||||
This document lists all metrics exported by Bottom when running with the `opentelemetry` feature enabled.
|
||||
|
||||
## System Metrics
|
||||
|
||||
### CPU
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|------------|------|--------|-------------|
|
||||
| `system_cpu_usage_percent` | Gauge | `cpu_id` | CPU usage percentage per core |
|
||||
|
||||
**Example:**
|
||||
```promql
|
||||
# Average CPU across all cores
|
||||
avg(system_cpu_usage_percent)
|
||||
|
||||
# CPU usage for core 0
|
||||
system_cpu_usage_percent{cpu_id="0"}
|
||||
```
|
||||
|
||||
### Memory
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|------------|------|--------|-------------|
|
||||
| `system_memory_usage_bytes` | Gauge | - | RAM memory currently in use |
|
||||
| `system_memory_total_bytes` | Gauge | - | Total RAM memory available |
|
||||
| `system_swap_usage_bytes` | Gauge | - | Swap memory currently in use |
|
||||
| `system_swap_total_bytes` | Gauge | - | Total swap memory available |
|
||||
|
||||
**Example:**
|
||||
```promql
|
||||
# Memory usage percentage
|
||||
(system_memory_usage_bytes / system_memory_total_bytes) * 100
|
||||
|
||||
# Available memory
|
||||
system_memory_total_bytes - system_memory_usage_bytes
|
||||
```
|
||||
|
||||
### Network
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|------------|------|--------|-------------|
|
||||
| `system_network_rx_bytes_rate` | Gauge | `interface` | Network receive rate in bytes/sec |
|
||||
| `system_network_tx_bytes_rate` | Gauge | `interface` | Network transmit rate in bytes/sec |
|
||||
|
||||
**Example:**
|
||||
```promql
|
||||
# Total network throughput
|
||||
sum(system_network_rx_bytes_rate) + sum(system_network_tx_bytes_rate)
|
||||
|
||||
# RX rate for specific interface
|
||||
system_network_rx_bytes_rate{interface="eth0"}
|
||||
```
|
||||
|
||||
### Disk
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|------------|------|--------|-------------|
|
||||
| `system_disk_usage_bytes` | Gauge | `device`, `mount` | Disk space currently in use |
|
||||
| `system_disk_total_bytes` | Gauge | `device`, `mount` | Total disk space available |
|
||||
|
||||
**Example:**
|
||||
```promql
|
||||
# Disk usage percentage
|
||||
(system_disk_usage_bytes / system_disk_total_bytes) * 100
|
||||
|
||||
# Free disk space
|
||||
system_disk_total_bytes - system_disk_usage_bytes
|
||||
```
|
||||
|
||||
### Temperature
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|------------|------|--------|-------------|
|
||||
| `system_temperature_celsius` | Gauge | `sensor` | Temperature readings in Celsius |
|
||||
|
||||
**Example:**
|
||||
```promql
|
||||
# Average temperature across all sensors
|
||||
avg(system_temperature_celsius)
|
||||
|
||||
# Maximum temperature
|
||||
max(system_temperature_celsius)
|
||||
```
|
||||
|
||||
## Process Metrics
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|------------|------|--------|-------------|
|
||||
| `system_process_cpu_usage_percent` | Gauge | `name`, `pid` | CPU usage percentage per process |
|
||||
| `system_process_memory_usage_bytes` | Gauge | `name`, `pid` | Memory usage in bytes per process |
|
||||
| `system_process_count` | Gauge | - | Total number of processes |
|
||||
|
||||
**Example:**
|
||||
```promql
|
||||
# Top 10 processes by CPU
|
||||
topk(10, system_process_cpu_usage_percent)
|
||||
|
||||
# Top 10 processes by memory
|
||||
topk(10, system_process_memory_usage_bytes)
|
||||
|
||||
# Total memory used by all Chrome processes
|
||||
sum(system_process_memory_usage_bytes{name=~".*chrome.*"})
|
||||
```
|
||||
|
||||
## Recording Rules
|
||||
|
||||
The following recording rules are pre-configured in Prometheus (see `rules/bottom_rules.yml`):
|
||||
|
||||
| Rule Name | Expression | Description |
|
||||
|-----------|------------|-------------|
|
||||
| `system_process_cpu_usage_percent:recent` | Recent process CPU metrics | Filters out stale process data (>2 min old) |
|
||||
| `system_process_memory_usage_bytes:recent` | Recent process memory metrics | Filters out stale process data (>2 min old) |
|
||||
|
||||
**Example:**
|
||||
```promql
|
||||
# Query only recent process data
|
||||
topk(10, system_process_cpu_usage_percent:recent)
|
||||
```
|
||||
|
||||
## Common Queries
|
||||
|
||||
### System Health
|
||||
|
||||
```promql
|
||||
# Overall system CPU usage
|
||||
avg(system_cpu_usage_percent)
|
||||
|
||||
# Memory pressure (>80% is high)
|
||||
(system_memory_usage_bytes / system_memory_total_bytes) * 100
|
||||
|
||||
# Disk pressure (>90% is critical)
|
||||
(system_disk_usage_bytes / system_disk_total_bytes) * 100
|
||||
```
|
||||
|
||||
### Resource Hogs
|
||||
|
||||
```promql
|
||||
# Top CPU consumers
|
||||
topk(5, system_process_cpu_usage_percent)
|
||||
|
||||
# Top memory consumers
|
||||
topk(5, system_process_memory_usage_bytes)
|
||||
|
||||
# Processes using >1GB memory
|
||||
system_process_memory_usage_bytes > 1073741824
|
||||
```
|
||||
|
||||
### Network Analysis
|
||||
|
||||
```promql
|
||||
# Total network traffic (RX + TX)
|
||||
sum(system_network_rx_bytes_rate) + sum(system_network_tx_bytes_rate)
|
||||
|
||||
# Network traffic by interface
|
||||
sum by (interface) (system_network_rx_bytes_rate + system_network_tx_bytes_rate)
|
||||
|
||||
# Interfaces with high RX rate (>10MB/s)
|
||||
system_network_rx_bytes_rate > 10485760
|
||||
```
|
||||
|
||||
## Alerting Examples
|
||||
|
||||
### Sample Prometheus Alert Rules
|
||||
|
||||
```yaml
|
||||
groups:
|
||||
- name: bottom_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: HighCPUUsage
|
||||
expr: avg(system_cpu_usage_percent) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU usage detected"
|
||||
description: "Average CPU usage is {{ $value }}%"
|
||||
|
||||
- alert: HighMemoryUsage
|
||||
expr: (system_memory_usage_bytes / system_memory_total_bytes) * 100 > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage detected"
|
||||
description: "Memory usage is {{ $value }}%"
|
||||
|
||||
- alert: DiskAlmostFull
|
||||
expr: (system_disk_usage_bytes / system_disk_total_bytes) * 100 > 90
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Disk {{ $labels.mount }} almost full"
|
||||
description: "Disk usage is {{ $value }}% on {{ $labels.mount }}"
|
||||
```
|
||||
|
||||
## Label Reference
|
||||
|
||||
| Label | Used In | Description |
|
||||
|-------|---------|-------------|
|
||||
| `cpu_id` | CPU metrics | CPU core identifier (0, 1, 2, ...) |
|
||||
| `interface` | Network metrics | Network interface name (eth0, wlan0, ...) |
|
||||
| `device` | Disk metrics | Device name (/dev/sda1, ...) |
|
||||
| `mount` | Disk metrics | Mount point (/, /home, ...) |
|
||||
| `sensor` | Temperature | Temperature sensor name |
|
||||
| `name` | Process metrics | Process name |
|
||||
| `pid` | Process metrics | Process ID |
|
||||
| `exported_job` | All | Always "bottom-system-monitor" |
|
||||
| `otel_scope_name` | All | Always "bottom-system-monitor" |
|
||||
|
||||
## Data Retention
|
||||
|
||||
By default, Prometheus stores metrics for 15 days. You can adjust this in the Prometheus configuration:
|
||||
|
||||
```yaml
|
||||
# In prometheus.yml
|
||||
global:
|
||||
retention_time: 30d # Keep data for 30 days
|
||||
```
|
||||
|
||||
For long-term storage, consider using:
|
||||
- **TimescaleDB** (see `docker-compose-timescale.yml.ko`)
|
||||
- **Thanos** for multi-cluster metrics
|
||||
- **Cortex** for horizontally scalable storage
|
||||
195
docker-compose/README.md
Normal file
195
docker-compose/README.md
Normal file
@@ -0,0 +1,195 @@
|
||||
# Bottom OpenTelemetry Docker Compose Setup
|
||||
|
||||
This directory contains a Docker Compose setup for running an observability stack to monitor Bottom with OpenTelemetry.
|
||||
|
||||
## Architecture
|
||||
|
||||
The stack includes:
|
||||
|
||||
1. **OpenTelemetry Collector** - Receives metrics from Bottom via OTLP protocol
|
||||
2. **Prometheus** - Scrapes and stores metrics from the OTEL Collector
|
||||
3. **Grafana** - Visualizes metrics from Prometheus
|
||||
|
||||
```
|
||||
Bottom (with --headless flag)
|
||||
↓ (OTLP/gRPC on port 4317)
|
||||
OpenTelemetry Collector
|
||||
↓ (Prometheus scrape on port 8889)
|
||||
Prometheus
|
||||
↓ (Query on port 9090)
|
||||
Grafana (accessible on port 3000)
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Start the observability stack
|
||||
|
||||
```bash
|
||||
cd docker-compose
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
This will start:
|
||||
- OpenTelemetry Collector on ports 4317 (gRPC), 4318 (HTTP), 8889 (metrics)
|
||||
- Prometheus on port 9090
|
||||
- Grafana on port 3000
|
||||
|
||||
### 2. Build Bottom with OpenTelemetry support
|
||||
|
||||
```bash
|
||||
cd ..
|
||||
cargo build --release --features opentelemetry
|
||||
```
|
||||
|
||||
### 3. Create a configuration file
|
||||
|
||||
Create a `bottom-config.toml` file:
|
||||
|
||||
```toml
|
||||
[opentelemetry]
|
||||
enabled = true
|
||||
endpoint = "http://localhost:4317"
|
||||
service_name = "bottom-system-monitor"
|
||||
export_interval_ms = 5000
|
||||
|
||||
[opentelemetry.metrics]
|
||||
cpu = true
|
||||
memory = true
|
||||
network = true
|
||||
disk = true
|
||||
processes = true
|
||||
temperature = true
|
||||
gpu = true
|
||||
```
|
||||
|
||||
### 4. Run Bottom in headless mode
|
||||
|
||||
```bash
|
||||
./target/release/btm --config bottom-config.toml --headless
|
||||
```
|
||||
|
||||
Or without config file:
|
||||
|
||||
```bash
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 \
|
||||
./target/release/btm --headless
|
||||
```
|
||||
|
||||
### 5. Access the dashboards
|
||||
|
||||
- **Prometheus**: http://localhost:9090
|
||||
- **Grafana**: http://localhost:3000 (username: `admin`, password: `admin`)
|
||||
|
||||
## Configuration Files
|
||||
|
||||
### otel-collector-config.yml
|
||||
|
||||
Configures the OpenTelemetry Collector to:
|
||||
- Receive OTLP data on ports 4317 (gRPC) and 4318 (HTTP)
|
||||
- Export metrics in Prometheus format on port 9090
|
||||
- Debug log all received data
|
||||
|
||||
### prometheus.yml
|
||||
|
||||
Configures Prometheus to:
|
||||
- Scrape metrics from the OTEL Collector every 10 seconds
|
||||
- Load alerting rules from `rules/bottom_rules.yml`
|
||||
|
||||
### rules/bottom_rules.yml
|
||||
|
||||
Contains Prometheus recording rules for Bottom metrics, including:
|
||||
- Recent process CPU usage metrics
|
||||
- Recent process memory usage metrics
|
||||
|
||||
## Viewing Metrics in Prometheus
|
||||
|
||||
1. Go to http://localhost:9090
|
||||
2. Click on "Graph"
|
||||
3. Try these example queries:
|
||||
|
||||
```promql
|
||||
# CPU usage by core
|
||||
system_cpu_usage_percent
|
||||
|
||||
# Memory usage
|
||||
system_memory_usage_bytes
|
||||
|
||||
# Network RX/TX
|
||||
system_network_rx_bytes
|
||||
system_network_tx_bytes
|
||||
|
||||
# Disk usage
|
||||
system_disk_usage_bytes
|
||||
|
||||
# Top processes by CPU
|
||||
topk(10, system_process_cpu_usage_percent)
|
||||
|
||||
# Top processes by memory
|
||||
topk(10, system_process_memory_usage_bytes)
|
||||
```
|
||||
|
||||
## Grafana Configuration
|
||||
|
||||
Grafana is automatically configured with:
|
||||
- **Prometheus data source** (http://prometheus:9090) - pre-configured
|
||||
- **Bottom System Overview dashboard** - pre-loaded
|
||||
|
||||
To access:
|
||||
1. Go to http://localhost:3000 (username: `admin`, password: `admin`)
|
||||
2. Navigate to Dashboards → Browse → "Bottom System Overview"
|
||||
|
||||
The dashboard includes:
|
||||
- CPU usage by core
|
||||
- Memory usage (RAM/Swap)
|
||||
- Network traffic
|
||||
- Disk usage
|
||||
- Top 10 processes by CPU
|
||||
- Top 10 processes by Memory
|
||||
|
||||
## Stopping the Stack
|
||||
|
||||
```bash
|
||||
docker-compose down
|
||||
```
|
||||
|
||||
To also remove volumes:
|
||||
|
||||
```bash
|
||||
docker-compose down -v
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Bottom not sending metrics
|
||||
|
||||
Check the OTEL Collector logs:
|
||||
```bash
|
||||
docker-compose logs -f otel-collector
|
||||
```
|
||||
|
||||
You should see messages about receiving metrics.
|
||||
|
||||
### Prometheus not scraping
|
||||
|
||||
1. Check Prometheus targets at http://localhost:9090/targets
|
||||
2. The `otel-collector` target should be UP
|
||||
|
||||
### No data in Grafana
|
||||
|
||||
1. Verify Prometheus data source is configured correctly
|
||||
2. Check that Prometheus has data by querying directly
|
||||
3. Ensure your time range in Grafana includes when Bottom was running
|
||||
|
||||
## Advanced Configuration
|
||||
|
||||
### Using with TimescaleDB (optional)
|
||||
|
||||
A TimescaleDB configuration file is available as `docker-compose-timescale.yml.ko` for long-term storage of metrics. Rename it to include it in your stack.
|
||||
|
||||
### Custom Prometheus Rules
|
||||
|
||||
Edit `rules/bottom_rules.yml` to add custom recording or alerting rules.
|
||||
|
||||
### OTEL Collector Sampling
|
||||
|
||||
Edit `otel-collector-config.yml` to adjust the batch processor settings for different performance characteristics.
|
||||
61
docker-compose/docker-compose-timescale.yml.ko
Normal file
61
docker-compose/docker-compose-timescale.yml.ko
Normal file
@@ -0,0 +1,61 @@
|
||||
services:
|
||||
timescaledb:
|
||||
image: timescale/timescaledb-ha:pg15
|
||||
environment:
|
||||
POSTGRES_PASSWORD: password
|
||||
POSTGRES_DB: promscale
|
||||
POSTGRES_USER: postgres
|
||||
ports:
|
||||
- "5432:5432"
|
||||
volumes:
|
||||
- timescale_data:/var/lib/postgresql/data
|
||||
|
||||
promscale:
|
||||
image: timescale/promscale:latest
|
||||
ports:
|
||||
- "9201:9201"
|
||||
depends_on:
|
||||
- timescaledb
|
||||
environment:
|
||||
PROMSCALE_DB_URI: postgres://postgres:password@timescaledb:5432/promscale?sslmode=disable
|
||||
PROMSCALE_STARTUP_INSTALL_EXTENSIONS: "true"
|
||||
restart: on-failure
|
||||
|
||||
otel-collector:
|
||||
image: otel/opentelemetry-collector-contrib:latest
|
||||
container_name: otel-collector
|
||||
command: ["--config=/etc/otel-collector-config.yml"]
|
||||
volumes:
|
||||
- ./otel-collector-config.yml:/etc/otel-collector-config.yml
|
||||
|
||||
ports:
|
||||
- "4317:4317"
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
container_name: prometheus
|
||||
volumes:
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- ./rules:/etc/prometheus/rules
|
||||
ports:
|
||||
- "9090:9090" # Interfaccia Web di Prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
depends_on:
|
||||
- otel-collector
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
ports:
|
||||
- "3000:3000"
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_PASSWORD=admin
|
||||
- GF_SECURITY_ADMIN_USER=admin
|
||||
volumes:
|
||||
- grafana-storage:/var/lib/grafana
|
||||
depends_on:
|
||||
- prometheus
|
||||
|
||||
volumes:
|
||||
grafana-storage:
|
||||
timescale_data:
|
||||
52
docker-compose/docker-compose.yml
Normal file
52
docker-compose/docker-compose.yml
Normal file
@@ -0,0 +1,52 @@
|
||||
services:
|
||||
|
||||
otel-collector:
|
||||
image: otel/opentelemetry-collector-contrib:latest
|
||||
container_name: otel-collector
|
||||
command: ["--config=/etc/otel-collector-config.yml"]
|
||||
volumes:
|
||||
- ./otel-collector-config.yml:/etc/otel-collector-config.yml
|
||||
ports:
|
||||
- "4317:4317" # gRPC
|
||||
- "4318:4318" # HTTP
|
||||
- "8889:8889" # Prometheus metrics endpoint
|
||||
networks:
|
||||
- observ-net
|
||||
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
container_name: prometheus
|
||||
volumes:
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- ./rules:/etc/prometheus/rules
|
||||
ports:
|
||||
- "9090:9090" # Interfaccia Web di Prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
depends_on:
|
||||
- otel-collector
|
||||
networks:
|
||||
- observ-net
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
ports:
|
||||
- "3000:3000"
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_PASSWORD=admin
|
||||
- GF_SECURITY_ADMIN_USER=admin
|
||||
volumes:
|
||||
- grafana-storage:/var/lib/grafana
|
||||
- ./grafana/provisioning:/etc/grafana/provisioning
|
||||
depends_on:
|
||||
- prometheus
|
||||
networks:
|
||||
- observ-net
|
||||
|
||||
volumes:
|
||||
grafana-storage:
|
||||
|
||||
networks:
|
||||
observ-net:
|
||||
driver: bridge
|
||||
@@ -0,0 +1,278 @@
|
||||
{
|
||||
"title": "Bottom System Overview",
|
||||
"uid": "bottom-overview",
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 16,
|
||||
"refresh": "5s",
|
||||
"editable": true,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "CPU Usage by Core",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "system_cpu_usage_percent",
|
||||
"legendFormat": "Core {{cpu_id}}",
|
||||
"refId": "CPU"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Memory Usage",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "system_memory_usage_bytes",
|
||||
"legendFormat": "RAM Used",
|
||||
"refId": "RAM"
|
||||
},
|
||||
{
|
||||
"expr": "system_memory_total_bytes",
|
||||
"legendFormat": "RAM Total",
|
||||
"refId": "RAM_Total"
|
||||
},
|
||||
{
|
||||
"expr": "system_swap_usage_bytes",
|
||||
"legendFormat": "Swap Used",
|
||||
"refId": "Swap"
|
||||
},
|
||||
{
|
||||
"expr": "system_swap_total_bytes",
|
||||
"legendFormat": "Swap Total",
|
||||
"refId": "Swap_Total"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "bytes"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Network Traffic",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "system_network_rx_bytes_rate",
|
||||
"legendFormat": "RX - {{interface}}",
|
||||
"refId": "RX"
|
||||
},
|
||||
{
|
||||
"expr": "system_network_tx_bytes_rate",
|
||||
"legendFormat": "TX - {{interface}}",
|
||||
"refId": "TX"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "Bps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Disk Usage",
|
||||
"type": "gauge",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "(system_disk_usage_bytes / system_disk_total_bytes) * 100",
|
||||
"legendFormat": "{{mount}} ({{device}})",
|
||||
"refId": "Disk"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100,
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"value": 0, "color": "green"},
|
||||
{"value": 70, "color": "yellow"},
|
||||
{"value": 90, "color": "red"}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Top 10 Processes by CPU",
|
||||
"type": "table",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10, system_process_cpu_usage_percent and (time() - timestamp(system_process_cpu_usage_percent) < 30))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "Process"
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"__name__": true,
|
||||
"job": true,
|
||||
"instance": true,
|
||||
"exported_job": true,
|
||||
"otel_scope_name": true
|
||||
},
|
||||
"indexByName": {
|
||||
"name": 0,
|
||||
"pid": 1,
|
||||
"Value": 2
|
||||
},
|
||||
"renameByName": {
|
||||
"name": "Process Name",
|
||||
"pid": "PID",
|
||||
"Value": "CPU %"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"sortBy": [
|
||||
{
|
||||
"displayName": "CPU %",
|
||||
"desc": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"align": "auto",
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "CPU %"},
|
||||
"properties": [
|
||||
{
|
||||
"id": "unit",
|
||||
"value": "percent"
|
||||
},
|
||||
{
|
||||
"id": "custom.displayMode",
|
||||
"value": "color-background"
|
||||
},
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"value": 0, "color": "green"},
|
||||
{"value": 50, "color": "yellow"},
|
||||
{"value": 80, "color": "red"}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Top 10 Processes by Memory",
|
||||
"type": "table",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10, system_process_memory_usage_bytes and (time() - timestamp(system_process_memory_usage_bytes) < 30))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"refId": "Process"
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {
|
||||
"Time": true,
|
||||
"__name__": true,
|
||||
"job": true,
|
||||
"instance": true,
|
||||
"exported_job": true,
|
||||
"otel_scope_name": true
|
||||
},
|
||||
"indexByName": {
|
||||
"name": 0,
|
||||
"pid": 1,
|
||||
"Value": 2
|
||||
},
|
||||
"renameByName": {
|
||||
"name": "Process Name",
|
||||
"pid": "PID",
|
||||
"Value": "Memory"
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showHeader": true,
|
||||
"sortBy": [
|
||||
{
|
||||
"displayName": "Memory",
|
||||
"desc": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"custom": {
|
||||
"align": "auto",
|
||||
"displayMode": "auto"
|
||||
}
|
||||
},
|
||||
"overrides": [
|
||||
{
|
||||
"matcher": {"id": "byName", "options": "Memory"},
|
||||
"properties": [
|
||||
{
|
||||
"id": "unit",
|
||||
"value": "bytes"
|
||||
},
|
||||
{
|
||||
"id": "custom.displayMode",
|
||||
"value": "color-background"
|
||||
},
|
||||
{
|
||||
"id": "thresholds",
|
||||
"value": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"value": 0, "color": "green"},
|
||||
{"value": 1073741824, "color": "yellow"},
|
||||
{"value": 2147483648, "color": "red"}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'Bottom Dashboards'
|
||||
orgId: 1
|
||||
folder: ''
|
||||
type: file
|
||||
disableDeletion: false
|
||||
updateIntervalSeconds: 10
|
||||
allowUiUpdates: true
|
||||
options:
|
||||
path: /etc/grafana/provisioning/dashboards
|
||||
@@ -0,0 +1,12 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
url: http://prometheus:9090
|
||||
isDefault: true
|
||||
editable: true
|
||||
jsonData:
|
||||
timeInterval: 10s
|
||||
queryTimeout: 60s
|
||||
31
docker-compose/otel-collector-config.yml
Normal file
31
docker-compose/otel-collector-config.yml
Normal file
@@ -0,0 +1,31 @@
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
|
||||
processors:
|
||||
batch:
|
||||
send_batch_size: 10000
|
||||
timeout: 10s
|
||||
metricsgeneration: {}
|
||||
|
||||
exporters:
|
||||
prometheus:
|
||||
endpoint: "0.0.0.0:8889"
|
||||
debug:
|
||||
verbosity: detailed
|
||||
|
||||
service:
|
||||
pipelines:
|
||||
metrics:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [prometheus, debug]
|
||||
|
||||
logs:
|
||||
receivers: [otlp]
|
||||
processors: [batch]
|
||||
exporters: [debug]
|
||||
67
docker-compose/processes-example.toml
Normal file
67
docker-compose/processes-example.toml
Normal file
@@ -0,0 +1,67 @@
|
||||
# Example process filter configuration file
|
||||
# This file can be included from the main bottom config to keep
|
||||
# server-specific process lists separate.
|
||||
#
|
||||
# Usage in bottom-config.toml:
|
||||
# [opentelemetry.metrics.process_filter]
|
||||
# include = "processes.toml"
|
||||
|
||||
# Filter mode: "whitelist" or "blacklist"
|
||||
# - whitelist: Only export metrics for processes in the lists below
|
||||
# - blacklist: Export metrics for all processes EXCEPT those in the lists
|
||||
filter_mode = "whitelist"
|
||||
|
||||
# Process names to monitor (case-insensitive substring match)
|
||||
# Examples for common server processes:
|
||||
names = [
|
||||
# Web servers
|
||||
"nginx",
|
||||
"apache",
|
||||
"httpd",
|
||||
|
||||
# Databases
|
||||
"postgres",
|
||||
"mysql",
|
||||
"redis",
|
||||
"mongodb",
|
||||
|
||||
# Application servers
|
||||
"java",
|
||||
"node",
|
||||
"python",
|
||||
|
||||
# Your custom applications
|
||||
# "myapp",
|
||||
]
|
||||
|
||||
# Regex patterns to match process names (case-sensitive)
|
||||
# More powerful than simple substring matching
|
||||
patterns = [
|
||||
# Match specific versions
|
||||
# "^nginx-[0-9.]+$",
|
||||
# "^node-v[0-9]+",
|
||||
|
||||
# Match Java applications with specific main class
|
||||
# "java.*MyApplication",
|
||||
|
||||
# Match processes with specific format
|
||||
# "^gunicorn: worker",
|
||||
|
||||
# Match kernel threads (for blacklist)
|
||||
# "^\\[.*\\]$",
|
||||
]
|
||||
|
||||
# Specific process PIDs to monitor (optional)
|
||||
# Useful for monitoring specific long-running processes
|
||||
pids = []
|
||||
|
||||
# Example blacklist configuration:
|
||||
# filter_mode = "blacklist"
|
||||
# names = [
|
||||
# "systemd", # Exclude system processes
|
||||
# "kworker",
|
||||
# "migration",
|
||||
# ]
|
||||
# patterns = [
|
||||
# "^\\[.*\\]$", # Exclude all kernel threads
|
||||
# ]
|
||||
21
docker-compose/prometheus.yml
Normal file
21
docker-compose/prometheus.yml
Normal file
@@ -0,0 +1,21 @@
|
||||
global:
|
||||
scrape_interval: 10s # Quanto spesso fare lo scraping
|
||||
evaluation_interval: 10s
|
||||
|
||||
rule_files:
|
||||
- /etc/prometheus/rules/*.yml
|
||||
|
||||
scrape_configs:
|
||||
# Job 1: Monitora se Prometheus stesso è attivo
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
# Job 2: Scrape dell'OpenTelemetry Collector
|
||||
- job_name: 'otel-collector'
|
||||
# Il Collector espone le metriche per lo scraping sulla sua porta 8889
|
||||
metrics_path: '/metrics'
|
||||
static_configs:
|
||||
# Raggiunge il Collector usando il suo nome di servizio Docker
|
||||
- targets: ['otel-collector:8889']
|
||||
|
||||
15
docker-compose/rules/bottom_rules.yml
Normal file
15
docker-compose/rules/bottom_rules.yml
Normal file
@@ -0,0 +1,15 @@
|
||||
groups:
|
||||
- name: bottom_process_metrics
|
||||
interval: 30s
|
||||
rules:
|
||||
- record: system_process_cpu_usage_percent:recent
|
||||
expr: |
|
||||
system_process_cpu_usage_percent
|
||||
and on(pid, name)
|
||||
(time() - timestamp(system_process_cpu_usage_percent) < 120)
|
||||
|
||||
- record: system_process_memory_usage_bytes:recent
|
||||
expr: |
|
||||
system_process_memory_usage_bytes
|
||||
and on(pid, name)
|
||||
(time() - timestamp(system_process_memory_usage_bytes) < 120)
|
||||
61
docker-compose/symon-config-example.toml
Normal file
61
docker-compose/symon-config-example.toml
Normal file
@@ -0,0 +1,61 @@
|
||||
# Example Symon configuration file for OpenTelemetry export
|
||||
# Copy this file and customize it for your needs
|
||||
|
||||
# Collection interval in seconds
|
||||
collection_interval_secs = 5
|
||||
|
||||
# OTLP configuration
|
||||
[otlp]
|
||||
# OTLP endpoint (gRPC)
|
||||
# For local docker-compose setup: http://localhost:4317
|
||||
# For remote collector: http://your-collector-host:4317
|
||||
endpoint = "http://localhost:4317"
|
||||
|
||||
# Export interval in seconds
|
||||
export_interval_secs = 10
|
||||
|
||||
# Service name that will appear in metrics
|
||||
service_name = "symon"
|
||||
|
||||
# Service version
|
||||
service_version = "0.1.0"
|
||||
|
||||
# Export timeout in seconds
|
||||
export_timeout_secs = 30
|
||||
|
||||
# Additional resource attributes (key-value pairs)
|
||||
[otlp.resource_attributes]
|
||||
environment = "production"
|
||||
host = "server-01"
|
||||
|
||||
# Metrics configuration - enable/disable specific metric types
|
||||
[metrics]
|
||||
cpu = true # CPU usage per core and average
|
||||
memory = true # RAM, swap usage
|
||||
network = true # Network RX/TX
|
||||
disk = true # Disk usage
|
||||
temperature = true # CPU/GPU temperatures
|
||||
processes = true # Top 10 processes by CPU/Memory
|
||||
|
||||
# Process filtering configuration
|
||||
[metrics.process_filter]
|
||||
# Option 1: Use an external file for server-specific process lists
|
||||
# This allows different servers to monitor different processes
|
||||
# Path can be relative to this config file or absolute
|
||||
#include = "processes.toml"
|
||||
|
||||
# Option 2: Configure inline
|
||||
# Filter mode: "whitelist" (only listed processes) or "blacklist" (exclude listed)
|
||||
filter_mode = "whitelist"
|
||||
|
||||
# List of process names to filter (case-insensitive substring match)
|
||||
# Examples: ["nginx", "postgres", "redis", "myapp"]
|
||||
names = ["nginx", "postgres", "redis"]
|
||||
|
||||
# List of regex patterns to match process names (case-sensitive)
|
||||
# More powerful than substring matching
|
||||
# Examples: ["^nginx-[0-9.]+$", "java.*MyApp", "^gunicorn: worker"]
|
||||
patterns = []
|
||||
|
||||
# List of specific process PIDs to filter
|
||||
pids = []
|
||||
80
docker-compose/test-stack.sh
Executable file
80
docker-compose/test-stack.sh
Executable file
@@ -0,0 +1,80 @@
|
||||
#!/bin/bash
|
||||
# Test script to verify the observability stack is running correctly
|
||||
|
||||
set -e
|
||||
|
||||
echo "🔍 Testing Bottom OpenTelemetry Stack..."
|
||||
echo ""
|
||||
|
||||
# Colors
|
||||
GREEN='\033[0;32m'
|
||||
RED='\033[0;31m'
|
||||
YELLOW='\033[1;33m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
# Test OTEL Collector gRPC endpoint
|
||||
echo -n "Testing OTEL Collector gRPC (port 4317)... "
|
||||
if nc -zv localhost 4317 2>&1 | grep -q "succeeded\|open"; then
|
||||
echo -e "${GREEN}✓ OK${NC}"
|
||||
else
|
||||
echo -e "${RED}✗ FAILED${NC}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Test OTEL Collector HTTP endpoint
|
||||
echo -n "Testing OTEL Collector HTTP (port 4318)... "
|
||||
if nc -zv localhost 4318 2>&1 | grep -q "succeeded\|open"; then
|
||||
echo -e "${GREEN}✓ OK${NC}"
|
||||
else
|
||||
echo -e "${RED}✗ FAILED${NC}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Test OTEL Collector metrics endpoint
|
||||
echo -n "Testing OTEL Collector metrics (port 8889)... "
|
||||
if curl -s http://localhost:8889/metrics > /dev/null; then
|
||||
echo -e "${GREEN}✓ OK${NC}"
|
||||
else
|
||||
echo -e "${RED}✗ FAILED${NC}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Test Prometheus
|
||||
echo -n "Testing Prometheus (port 9090)... "
|
||||
if curl -s http://localhost:9090/-/healthy | grep -q "Prometheus"; then
|
||||
echo -e "${GREEN}✓ OK${NC}"
|
||||
else
|
||||
echo -e "${RED}✗ FAILED${NC}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Test Prometheus targets
|
||||
echo -n "Testing Prometheus targets... "
|
||||
TARGETS=$(curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"up"' | wc -l)
|
||||
if [ "$TARGETS" -gt 0 ]; then
|
||||
echo -e "${GREEN}✓ OK${NC} (${TARGETS} targets up)"
|
||||
else
|
||||
echo -e "${YELLOW}⚠ WARNING${NC} (no targets up yet - this is normal if just started)"
|
||||
fi
|
||||
|
||||
# Test Grafana
|
||||
echo -n "Testing Grafana (port 3000)... "
|
||||
if curl -s http://localhost:3000/api/health | grep -q "ok"; then
|
||||
echo -e "${GREEN}✓ OK${NC}"
|
||||
else
|
||||
echo -e "${RED}✗ FAILED${NC}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo -e "${GREEN}✓ All tests passed!${NC}"
|
||||
echo ""
|
||||
echo "📊 Access points:"
|
||||
echo " - Prometheus: http://localhost:9090"
|
||||
echo " - Grafana: http://localhost:3000 (admin/admin)"
|
||||
echo " - OTEL Collector metrics: http://localhost:8889/metrics"
|
||||
echo ""
|
||||
echo "💡 Next steps:"
|
||||
echo " 1. Build bottom with: cargo build --release --features opentelemetry"
|
||||
echo " 2. Run in headless mode: ./target/release/btm --headless"
|
||||
echo " 3. Check metrics in Prometheus: http://localhost:9090/graph"
|
||||
214
src/collector.rs
Normal file
214
src/collector.rs
Normal file
@@ -0,0 +1,214 @@
|
||||
use crate::config::MetricsConfig;
|
||||
use anyhow::Result;
|
||||
use sysinfo::{CpuRefreshKind, Disks, Networks, RefreshKind, System};
|
||||
|
||||
/// System metrics collected at a point in time
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct SystemMetrics {
|
||||
pub cpu: Option<Vec<CpuMetric>>,
|
||||
pub memory: Option<MemoryMetric>,
|
||||
pub network: Option<Vec<NetworkMetric>>,
|
||||
pub disk: Option<Vec<DiskMetric>>,
|
||||
pub processes: Option<Vec<ProcessMetric>>,
|
||||
pub temperature: Option<Vec<TemperatureMetric>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct CpuMetric {
|
||||
pub core_index: usize,
|
||||
pub usage_percent: f32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct MemoryMetric {
|
||||
pub used_bytes: u64,
|
||||
pub total_bytes: u64,
|
||||
pub swap_used_bytes: u64,
|
||||
pub swap_total_bytes: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NetworkMetric {
|
||||
pub interface_name: String,
|
||||
pub rx_bytes_total: u64,
|
||||
pub tx_bytes_total: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DiskMetric {
|
||||
pub device_name: String,
|
||||
pub mount_point: String,
|
||||
pub used_bytes: u64,
|
||||
pub total_bytes: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ProcessMetric {
|
||||
pub pid: u32,
|
||||
pub name: String,
|
||||
pub cpu_usage_percent: f32,
|
||||
pub memory_bytes: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct TemperatureMetric {
|
||||
pub sensor_name: String,
|
||||
pub temperature_celsius: f32,
|
||||
}
|
||||
|
||||
/// Collector for system metrics
|
||||
pub struct MetricsCollector {
|
||||
system: System,
|
||||
networks: Networks,
|
||||
disks: Disks,
|
||||
config: MetricsConfig,
|
||||
}
|
||||
|
||||
impl MetricsCollector {
|
||||
pub fn new(config: MetricsConfig) -> Self {
|
||||
let refresh_kind = RefreshKind::new()
|
||||
.with_cpu(CpuRefreshKind::everything())
|
||||
.with_memory(sysinfo::MemoryRefreshKind::everything())
|
||||
.with_processes(sysinfo::ProcessRefreshKind::everything());
|
||||
|
||||
Self {
|
||||
system: System::new_with_specifics(refresh_kind),
|
||||
networks: Networks::new_with_refreshed_list(),
|
||||
disks: Disks::new_with_refreshed_list(),
|
||||
config,
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect all enabled metrics
|
||||
pub fn collect(&mut self) -> Result<SystemMetrics> {
|
||||
// Refresh system info
|
||||
self.system.refresh_all();
|
||||
self.networks.refresh();
|
||||
|
||||
Ok(SystemMetrics {
|
||||
cpu: if self.config.cpu {
|
||||
Some(self.collect_cpu())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
memory: if self.config.memory {
|
||||
Some(self.collect_memory())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
network: if self.config.network {
|
||||
Some(self.collect_network())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
disk: if self.config.disk {
|
||||
Some(self.collect_disk())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
processes: if self.config.processes {
|
||||
Some(self.collect_processes())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
temperature: if self.config.temperature {
|
||||
Some(self.collect_temperature())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
fn collect_cpu(&self) -> Vec<CpuMetric> {
|
||||
self.system
|
||||
.cpus()
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(index, cpu)| CpuMetric {
|
||||
core_index: index,
|
||||
usage_percent: cpu.cpu_usage(),
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn collect_memory(&self) -> MemoryMetric {
|
||||
MemoryMetric {
|
||||
used_bytes: self.system.used_memory(),
|
||||
total_bytes: self.system.total_memory(),
|
||||
swap_used_bytes: self.system.used_swap(),
|
||||
swap_total_bytes: self.system.total_swap(),
|
||||
}
|
||||
}
|
||||
|
||||
fn collect_network(&self) -> Vec<NetworkMetric> {
|
||||
self.networks
|
||||
.iter()
|
||||
.map(|(interface_name, data)| NetworkMetric {
|
||||
interface_name: interface_name.to_string(),
|
||||
rx_bytes_total: data.total_received(),
|
||||
tx_bytes_total: data.total_transmitted(),
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn collect_disk(&self) -> Vec<DiskMetric> {
|
||||
self.disks
|
||||
.iter()
|
||||
.filter_map(|disk| {
|
||||
let total_bytes = disk.total_space();
|
||||
let available_bytes = disk.available_space();
|
||||
let used_bytes = total_bytes.saturating_sub(available_bytes);
|
||||
|
||||
Some(DiskMetric {
|
||||
device_name: disk.name().to_string_lossy().to_string(),
|
||||
mount_point: disk.mount_point().to_string_lossy().to_string(),
|
||||
used_bytes,
|
||||
total_bytes,
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn collect_processes(&self) -> Vec<ProcessMetric> {
|
||||
let filter = self.config.process_filter.as_ref();
|
||||
|
||||
let mut processes: Vec<ProcessMetric> = self
|
||||
.system
|
||||
.processes()
|
||||
.iter()
|
||||
.filter(|(_, process)| {
|
||||
if let Some(filter_config) = filter {
|
||||
filter_config.should_include_process(
|
||||
process.name().to_string_lossy().as_ref(),
|
||||
process.pid().as_u32(),
|
||||
)
|
||||
} else {
|
||||
true
|
||||
}
|
||||
})
|
||||
.map(|(_, process)| ProcessMetric {
|
||||
pid: process.pid().as_u32(),
|
||||
name: process.name().to_string_lossy().to_string(),
|
||||
cpu_usage_percent: process.cpu_usage(),
|
||||
memory_bytes: process.memory(),
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Sort by CPU usage and limit to top 10
|
||||
processes.sort_by(|a, b| {
|
||||
b.cpu_usage_percent
|
||||
.partial_cmp(&a.cpu_usage_percent)
|
||||
.unwrap_or(std::cmp::Ordering::Equal)
|
||||
});
|
||||
processes.truncate(10);
|
||||
|
||||
processes
|
||||
}
|
||||
|
||||
fn collect_temperature(&self) -> Vec<TemperatureMetric> {
|
||||
// sysinfo doesn't have direct temperature support in 0.31
|
||||
// This would require platform-specific implementation or additional crates
|
||||
// For now, return empty vector
|
||||
vec![]
|
||||
}
|
||||
}
|
||||
347
src/config.rs
Normal file
347
src/config.rs
Normal file
@@ -0,0 +1,347 @@
|
||||
use anyhow::{Context, Result};
|
||||
use regex::Regex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::Duration;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct Config {
|
||||
/// OpenTelemetry export configuration
|
||||
#[serde(default)]
|
||||
pub otlp: OtlpConfig,
|
||||
|
||||
/// Metrics collection configuration
|
||||
#[serde(default)]
|
||||
pub metrics: MetricsConfig,
|
||||
|
||||
/// Collection interval
|
||||
#[serde(default = "default_collection_interval")]
|
||||
pub collection_interval_secs: u64,
|
||||
}
|
||||
|
||||
impl Default for Config {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
otlp: OtlpConfig::default(),
|
||||
metrics: MetricsConfig::default(),
|
||||
collection_interval_secs: default_collection_interval(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Config {
|
||||
/// Load configuration from file
|
||||
pub fn from_file(path: &Path) -> Result<Self> {
|
||||
let content = std::fs::read_to_string(path)
|
||||
.with_context(|| format!("Failed to read config file: {}", path.display()))?;
|
||||
|
||||
let mut config: Config = toml::from_str(&content)
|
||||
.with_context(|| format!("Failed to parse config file: {}", path.display()))?;
|
||||
|
||||
// Load process filter includes if configured
|
||||
if let Some(process_filter) = &config.metrics.process_filter {
|
||||
let config_dir = path.parent();
|
||||
match process_filter.load_with_includes(config_dir) {
|
||||
Ok(loaded_filter) => {
|
||||
config.metrics.process_filter = Some(loaded_filter);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to load process filter include: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
config.validate()?;
|
||||
Ok(config)
|
||||
}
|
||||
|
||||
/// Validate configuration
|
||||
pub fn validate(&self) -> Result<()> {
|
||||
if self.collection_interval_secs == 0 {
|
||||
anyhow::bail!("Collection interval must be greater than 0");
|
||||
}
|
||||
|
||||
self.otlp.validate()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn collection_interval(&self) -> Duration {
|
||||
Duration::from_secs(self.collection_interval_secs)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct OtlpConfig {
|
||||
/// OTLP endpoint (e.g., "http://localhost:4317")
|
||||
#[serde(default = "default_endpoint")]
|
||||
pub endpoint: String,
|
||||
|
||||
/// Export interval in seconds
|
||||
#[serde(default = "default_export_interval")]
|
||||
pub export_interval_secs: u64,
|
||||
|
||||
/// Service name for the metrics
|
||||
#[serde(default = "default_service_name")]
|
||||
pub service_name: String,
|
||||
|
||||
/// Service version
|
||||
#[serde(default = "default_service_version")]
|
||||
pub service_version: String,
|
||||
|
||||
/// Additional resource attributes
|
||||
#[serde(default)]
|
||||
pub resource_attributes: std::collections::HashMap<String, String>,
|
||||
|
||||
/// Timeout for export operations in seconds
|
||||
#[serde(default = "default_timeout")]
|
||||
pub export_timeout_secs: u64,
|
||||
}
|
||||
|
||||
impl Default for OtlpConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
endpoint: default_endpoint(),
|
||||
export_interval_secs: default_export_interval(),
|
||||
service_name: default_service_name(),
|
||||
service_version: default_service_version(),
|
||||
resource_attributes: std::collections::HashMap::new(),
|
||||
export_timeout_secs: default_timeout(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl OtlpConfig {
|
||||
pub fn export_interval(&self) -> Duration {
|
||||
Duration::from_secs(self.export_interval_secs)
|
||||
}
|
||||
|
||||
pub fn export_timeout(&self) -> Duration {
|
||||
Duration::from_secs(self.export_timeout_secs)
|
||||
}
|
||||
|
||||
pub fn validate(&self) -> Result<()> {
|
||||
if self.endpoint.is_empty() {
|
||||
anyhow::bail!("OTLP endpoint cannot be empty");
|
||||
}
|
||||
|
||||
if !self.endpoint.starts_with("http://") && !self.endpoint.starts_with("https://") {
|
||||
anyhow::bail!("OTLP endpoint must be a valid HTTP/HTTPS URL");
|
||||
}
|
||||
|
||||
if self.export_interval_secs == 0 {
|
||||
anyhow::bail!("Export interval must be greater than 0");
|
||||
}
|
||||
|
||||
if self.service_name.is_empty() {
|
||||
anyhow::bail!("Service name cannot be empty");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct MetricsConfig {
|
||||
/// Export CPU metrics
|
||||
#[serde(default = "default_true")]
|
||||
pub cpu: bool,
|
||||
|
||||
/// Export memory metrics
|
||||
#[serde(default = "default_true")]
|
||||
pub memory: bool,
|
||||
|
||||
/// Export network metrics
|
||||
#[serde(default = "default_true")]
|
||||
pub network: bool,
|
||||
|
||||
/// Export disk metrics
|
||||
#[serde(default = "default_true")]
|
||||
pub disk: bool,
|
||||
|
||||
/// Export process metrics
|
||||
#[serde(default)]
|
||||
pub processes: bool,
|
||||
|
||||
/// Export temperature metrics
|
||||
#[serde(default = "default_true")]
|
||||
pub temperature: bool,
|
||||
|
||||
/// Process filter configuration
|
||||
#[serde(default)]
|
||||
pub process_filter: Option<ProcessFilterConfig>,
|
||||
}
|
||||
|
||||
impl Default for MetricsConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
cpu: true,
|
||||
memory: true,
|
||||
network: true,
|
||||
disk: true,
|
||||
processes: false,
|
||||
temperature: true,
|
||||
process_filter: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ProcessFilterConfig {
|
||||
/// Path to external file containing process filter (optional)
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub include: Option<PathBuf>,
|
||||
|
||||
/// Filter mode: "whitelist" or "blacklist"
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub filter_mode: Option<ProcessFilterMode>,
|
||||
|
||||
/// List of process names to filter (case-insensitive substring match)
|
||||
#[serde(default)]
|
||||
pub names: Vec<String>,
|
||||
|
||||
/// List of regex patterns to match process names
|
||||
#[serde(default)]
|
||||
pub patterns: Vec<String>,
|
||||
|
||||
/// List of process PIDs to filter
|
||||
#[serde(default)]
|
||||
pub pids: Vec<u32>,
|
||||
|
||||
/// Compiled regex patterns (not serialized, built at runtime)
|
||||
#[serde(skip)]
|
||||
compiled_patterns: Option<Vec<Regex>>,
|
||||
}
|
||||
|
||||
impl ProcessFilterConfig {
|
||||
/// Load and merge process filter from include file if specified
|
||||
pub fn load_with_includes(& self, config_dir: Option<&Path>) -> Result<Self> {
|
||||
if let Some(include_path) = &self.include {
|
||||
// Resolve path relative to config directory if provided
|
||||
let full_path = if include_path.is_absolute() {
|
||||
include_path.clone()
|
||||
} else if let Some(dir) = config_dir {
|
||||
dir.join(include_path)
|
||||
} else {
|
||||
include_path.clone()
|
||||
};
|
||||
|
||||
// Read and parse the included file
|
||||
let content = std::fs::read_to_string(&full_path)
|
||||
.with_context(|| format!("Failed to read process filter file: {}", full_path.display()))?;
|
||||
|
||||
let included: ProcessFilterConfig = toml::from_str(&content)
|
||||
.with_context(|| format!("Failed to parse process filter file: {}", full_path.display()))?;
|
||||
|
||||
// Merge: included file takes precedence
|
||||
let mut merged = Self {
|
||||
include: None,
|
||||
filter_mode: included.filter_mode.or(self.filter_mode),
|
||||
names: if included.names.is_empty() {
|
||||
self.names.clone()
|
||||
} else {
|
||||
included.names
|
||||
},
|
||||
patterns: if included.patterns.is_empty() {
|
||||
self.patterns.clone()
|
||||
} else {
|
||||
included.patterns
|
||||
},
|
||||
pids: if included.pids.is_empty() {
|
||||
self.pids.clone()
|
||||
} else {
|
||||
included.pids
|
||||
},
|
||||
compiled_patterns: None,
|
||||
};
|
||||
|
||||
merged.compile_patterns()?;
|
||||
Ok(merged)
|
||||
} else {
|
||||
let mut result = self.clone();
|
||||
result.compile_patterns()?;
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
/// Compile regex patterns from strings
|
||||
fn compile_patterns(&mut self) -> Result<()> {
|
||||
if self.patterns.is_empty() {
|
||||
self.compiled_patterns = None;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut compiled = Vec::new();
|
||||
for pattern in &self.patterns {
|
||||
let regex = Regex::new(pattern)
|
||||
.with_context(|| format!("Invalid regex pattern: {}", pattern))?;
|
||||
compiled.push(regex);
|
||||
}
|
||||
|
||||
self.compiled_patterns = Some(compiled);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check if a process should be included based on filter configuration
|
||||
pub fn should_include_process(&self, process_name: &str, process_pid: u32) -> bool {
|
||||
let filter_mode = match &self.filter_mode {
|
||||
Some(mode) => mode,
|
||||
None => return true,
|
||||
};
|
||||
|
||||
// Check if process matches the filter lists
|
||||
let matches_name = self
|
||||
.names
|
||||
.iter()
|
||||
.any(|name| process_name.to_lowercase().contains(&name.to_lowercase()));
|
||||
|
||||
let matches_pattern = if let Some(patterns) = &self.compiled_patterns {
|
||||
patterns.iter().any(|regex| regex.is_match(process_name))
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
let matches_pid = self.pids.contains(&process_pid);
|
||||
let matches = matches_name || matches_pattern || matches_pid;
|
||||
|
||||
match filter_mode {
|
||||
ProcessFilterMode::Whitelist => matches,
|
||||
ProcessFilterMode::Blacklist => !matches,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum ProcessFilterMode {
|
||||
Whitelist,
|
||||
Blacklist,
|
||||
}
|
||||
|
||||
// Default functions
|
||||
fn default_endpoint() -> String {
|
||||
"http://localhost:4317".to_string()
|
||||
}
|
||||
|
||||
fn default_export_interval() -> u64 {
|
||||
10
|
||||
}
|
||||
|
||||
fn default_collection_interval() -> u64 {
|
||||
5
|
||||
}
|
||||
|
||||
fn default_service_name() -> String {
|
||||
"symon".to_string()
|
||||
}
|
||||
|
||||
fn default_service_version() -> String {
|
||||
env!("CARGO_PKG_VERSION").to_string()
|
||||
}
|
||||
|
||||
fn default_timeout() -> u64 {
|
||||
30
|
||||
}
|
||||
|
||||
fn default_true() -> bool {
|
||||
true
|
||||
}
|
||||
196
src/exporter.rs
Normal file
196
src/exporter.rs
Normal file
@@ -0,0 +1,196 @@
|
||||
use crate::collector::SystemMetrics;
|
||||
use crate::config::OtlpConfig;
|
||||
use anyhow::{Context, Result};
|
||||
use opentelemetry::metrics::MeterProvider;
|
||||
use opentelemetry::KeyValue;
|
||||
use opentelemetry_otlp::WithExportConfig;
|
||||
use opentelemetry_sdk::metrics::{PeriodicReader, SdkMeterProvider};
|
||||
use opentelemetry_sdk::Resource;
|
||||
|
||||
pub struct MetricsExporter {
|
||||
meter_provider: SdkMeterProvider,
|
||||
gauges: MetricInstruments,
|
||||
}
|
||||
|
||||
struct MetricInstruments {
|
||||
cpu_usage: opentelemetry::metrics::Gauge<f64>,
|
||||
memory_usage: opentelemetry::metrics::Gauge<u64>,
|
||||
memory_total: opentelemetry::metrics::Gauge<u64>,
|
||||
swap_usage: opentelemetry::metrics::Gauge<u64>,
|
||||
swap_total: opentelemetry::metrics::Gauge<u64>,
|
||||
network_rx: opentelemetry::metrics::Counter<u64>,
|
||||
network_tx: opentelemetry::metrics::Counter<u64>,
|
||||
disk_usage: opentelemetry::metrics::Gauge<u64>,
|
||||
disk_total: opentelemetry::metrics::Gauge<u64>,
|
||||
process_cpu: opentelemetry::metrics::Gauge<f64>,
|
||||
process_memory: opentelemetry::metrics::Gauge<u64>,
|
||||
temperature: opentelemetry::metrics::Gauge<f64>,
|
||||
}
|
||||
|
||||
impl MetricsExporter {
|
||||
pub async fn new(config: &OtlpConfig) -> Result<Self> {
|
||||
// Build resource with service information
|
||||
let mut resource_kvs = vec![
|
||||
KeyValue::new("service.name", config.service_name.clone()),
|
||||
KeyValue::new("service.version", config.service_version.clone()),
|
||||
];
|
||||
|
||||
// Add custom resource attributes
|
||||
for (key, value) in &config.resource_attributes {
|
||||
resource_kvs.push(KeyValue::new(key.clone(), value.clone()));
|
||||
}
|
||||
|
||||
let resource = Resource::new(resource_kvs);
|
||||
|
||||
// Build OTLP exporter using new pipeline API
|
||||
let exporter = opentelemetry_otlp::new_exporter()
|
||||
.tonic()
|
||||
.with_endpoint(&config.endpoint)
|
||||
.with_timeout(config.export_timeout())
|
||||
.build_metrics_exporter(
|
||||
Box::new(opentelemetry_sdk::metrics::reader::DefaultTemporalitySelector::default())
|
||||
)
|
||||
.context("Failed to build OTLP metrics exporter")?;
|
||||
|
||||
// Build meter provider
|
||||
let reader = PeriodicReader::builder(exporter, opentelemetry_sdk::runtime::Tokio)
|
||||
.with_interval(config.export_interval())
|
||||
.build();
|
||||
|
||||
let meter_provider = SdkMeterProvider::builder()
|
||||
.with_reader(reader)
|
||||
.with_resource(resource)
|
||||
.build();
|
||||
|
||||
// Create meter and instruments
|
||||
let meter = meter_provider.meter("symon");
|
||||
|
||||
let gauges = MetricInstruments {
|
||||
cpu_usage: meter
|
||||
.f64_gauge("system_cpu_usage_percent")
|
||||
.with_description("CPU usage percentage per core")
|
||||
.init(),
|
||||
memory_usage: meter
|
||||
.u64_gauge("system_memory_usage_bytes")
|
||||
.with_description("Memory usage in bytes")
|
||||
.init(),
|
||||
memory_total: meter
|
||||
.u64_gauge("system_memory_total_bytes")
|
||||
.with_description("Total memory in bytes")
|
||||
.init(),
|
||||
swap_usage: meter
|
||||
.u64_gauge("system_swap_usage_bytes")
|
||||
.with_description("Swap usage in bytes")
|
||||
.init(),
|
||||
swap_total: meter
|
||||
.u64_gauge("system_swap_total_bytes")
|
||||
.with_description("Total swap in bytes")
|
||||
.init(),
|
||||
network_rx: meter
|
||||
.u64_counter("system_network_rx_bytes_total")
|
||||
.with_description("Total bytes received")
|
||||
.init(),
|
||||
network_tx: meter
|
||||
.u64_counter("system_network_tx_bytes_total")
|
||||
.with_description("Total bytes transmitted")
|
||||
.init(),
|
||||
disk_usage: meter
|
||||
.u64_gauge("system_disk_usage_bytes")
|
||||
.with_description("Disk usage in bytes")
|
||||
.init(),
|
||||
disk_total: meter
|
||||
.u64_gauge("system_disk_total_bytes")
|
||||
.with_description("Total disk space in bytes")
|
||||
.init(),
|
||||
process_cpu: meter
|
||||
.f64_gauge("system_process_cpu_usage_percent")
|
||||
.with_description("Process CPU usage percentage")
|
||||
.init(),
|
||||
process_memory: meter
|
||||
.u64_gauge("system_process_memory_usage_bytes")
|
||||
.with_description("Process memory usage in bytes")
|
||||
.init(),
|
||||
temperature: meter
|
||||
.f64_gauge("system_temperature_celsius")
|
||||
.with_description("Temperature in Celsius")
|
||||
.init(),
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
meter_provider,
|
||||
gauges,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn export(&self, metrics: &SystemMetrics) {
|
||||
// Export CPU metrics
|
||||
if let Some(cpu_metrics) = &metrics.cpu {
|
||||
for cpu in cpu_metrics {
|
||||
self.gauges.cpu_usage.record(
|
||||
cpu.usage_percent as f64,
|
||||
&[KeyValue::new("cpu_id", cpu.core_index as i64)],
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Export memory metrics
|
||||
if let Some(memory) = &metrics.memory {
|
||||
self.gauges.memory_usage.record(memory.used_bytes, &[]);
|
||||
self.gauges.memory_total.record(memory.total_bytes, &[]);
|
||||
self.gauges.swap_usage.record(memory.swap_used_bytes, &[]);
|
||||
self.gauges.swap_total.record(memory.swap_total_bytes, &[]);
|
||||
}
|
||||
|
||||
// Export network metrics
|
||||
if let Some(network_metrics) = &metrics.network {
|
||||
for net in network_metrics {
|
||||
let attrs = &[KeyValue::new("interface", net.interface_name.clone())];
|
||||
self.gauges.network_rx.add(net.rx_bytes_total, attrs);
|
||||
self.gauges.network_tx.add(net.tx_bytes_total, attrs);
|
||||
}
|
||||
}
|
||||
|
||||
// Export disk metrics
|
||||
if let Some(disk_metrics) = &metrics.disk {
|
||||
for disk in disk_metrics {
|
||||
let attrs = &[
|
||||
KeyValue::new("device", disk.device_name.clone()),
|
||||
KeyValue::new("mount", disk.mount_point.clone()),
|
||||
];
|
||||
self.gauges.disk_usage.record(disk.used_bytes, attrs);
|
||||
self.gauges.disk_total.record(disk.total_bytes, attrs);
|
||||
}
|
||||
}
|
||||
|
||||
// Export process metrics
|
||||
if let Some(process_metrics) = &metrics.processes {
|
||||
for process in process_metrics {
|
||||
let attrs = &[
|
||||
KeyValue::new("pid", process.pid as i64),
|
||||
KeyValue::new("name", process.name.clone()),
|
||||
];
|
||||
self.gauges
|
||||
.process_cpu
|
||||
.record(process.cpu_usage_percent as f64, attrs);
|
||||
self.gauges.process_memory.record(process.memory_bytes, attrs);
|
||||
}
|
||||
}
|
||||
|
||||
// Export temperature metrics
|
||||
if let Some(temp_metrics) = &metrics.temperature {
|
||||
for temp in temp_metrics {
|
||||
self.gauges.temperature.record(
|
||||
temp.temperature_celsius as f64,
|
||||
&[KeyValue::new("sensor", temp.sensor_name.clone())],
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn shutdown(self) -> Result<()> {
|
||||
self.meter_provider
|
||||
.shutdown()
|
||||
.context("Failed to shutdown meter provider")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
108
src/main.rs
Normal file
108
src/main.rs
Normal file
@@ -0,0 +1,108 @@
|
||||
mod collector;
|
||||
mod config;
|
||||
mod exporter;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use clap::Parser;
|
||||
use collector::MetricsCollector;
|
||||
use config::Config;
|
||||
use exporter::MetricsExporter;
|
||||
use std::path::PathBuf;
|
||||
use tokio::signal;
|
||||
use tokio::time::interval;
|
||||
use tracing::{error, info};
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(name = "symon")]
|
||||
#[command(about = "Lightweight system metrics exporter for OpenTelemetry", long_about = None)]
|
||||
#[command(version)]
|
||||
struct Args {
|
||||
/// Path to configuration file
|
||||
#[arg(short, long, value_name = "FILE")]
|
||||
config: Option<PathBuf>,
|
||||
|
||||
/// Log level (trace, debug, info, warn, error)
|
||||
#[arg(short, long, default_value = "info")]
|
||||
log_level: String,
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
// Initialize tracing
|
||||
let log_level = args.log_level.parse().unwrap_or(tracing::Level::INFO);
|
||||
tracing_subscriber::fmt()
|
||||
.with_max_level(log_level)
|
||||
.with_target(false)
|
||||
.init();
|
||||
|
||||
info!("Starting symon v{}", env!("CARGO_PKG_VERSION"));
|
||||
|
||||
// Load configuration
|
||||
let config_path = args
|
||||
.config
|
||||
.or_else(find_default_config)
|
||||
.context("No configuration file specified and no default config found")?;
|
||||
|
||||
info!("Loading configuration from: {}", config_path.display());
|
||||
let config = Config::from_file(&config_path)?;
|
||||
|
||||
info!(
|
||||
"OTLP endpoint: {}, export interval: {}s, collection interval: {}s",
|
||||
config.otlp.endpoint, config.otlp.export_interval_secs, config.collection_interval_secs
|
||||
);
|
||||
|
||||
// Initialize metrics collector
|
||||
let mut collector = MetricsCollector::new(config.metrics.clone());
|
||||
|
||||
// Initialize OTLP exporter
|
||||
info!("Initializing OTLP exporter...");
|
||||
let exporter = MetricsExporter::new(&config.otlp)
|
||||
.await
|
||||
.context("Failed to initialize OTLP exporter")?;
|
||||
|
||||
info!("Symon initialized successfully");
|
||||
info!("Press Ctrl+C to stop");
|
||||
|
||||
// Main collection loop
|
||||
let mut tick_interval = interval(config.collection_interval());
|
||||
let mut shutdown = Box::pin(signal::ctrl_c());
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = tick_interval.tick() => {
|
||||
match collector.collect() {
|
||||
Ok(metrics) => {
|
||||
exporter.export(&metrics);
|
||||
tracing::debug!("Metrics collected and exported");
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to collect metrics: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ = &mut shutdown => {
|
||||
info!("Shutdown signal received");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Shutdown gracefully
|
||||
info!("Shutting down...");
|
||||
exporter.shutdown().await?;
|
||||
info!("Symon stopped");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn find_default_config() -> Option<PathBuf> {
|
||||
let candidates = vec![
|
||||
PathBuf::from("symon.toml"),
|
||||
PathBuf::from("/etc/symon/symon.toml"),
|
||||
PathBuf::from("config.toml"),
|
||||
];
|
||||
|
||||
candidates.into_iter().find(|p| p.exists())
|
||||
}
|
||||
76
symon.toml
Normal file
76
symon.toml
Normal file
@@ -0,0 +1,76 @@
|
||||
# Symon Configuration File
|
||||
# Lightweight system metrics exporter for OpenTelemetry
|
||||
|
||||
# Collection interval in seconds
|
||||
# How often to collect system metrics
|
||||
collection_interval_secs = 5
|
||||
|
||||
# OTLP configuration
|
||||
[otlp]
|
||||
# OTLP endpoint (gRPC)
|
||||
endpoint = "http://localhost:4317"
|
||||
|
||||
# Export interval in seconds
|
||||
# How often to export metrics to OTLP collector
|
||||
export_interval_secs = 10
|
||||
|
||||
# Service name that will appear in metrics
|
||||
service_name = "symon"
|
||||
|
||||
# Service version
|
||||
service_version = "0.1.0"
|
||||
|
||||
# Export timeout in seconds
|
||||
export_timeout_secs = 30
|
||||
|
||||
# Additional resource attributes (key-value pairs)
|
||||
[otlp.resource_attributes]
|
||||
environment = "production"
|
||||
# host = "server-01"
|
||||
# datacenter = "us-east-1"
|
||||
|
||||
# Metrics configuration - enable/disable specific metric types
|
||||
[metrics]
|
||||
cpu = true # CPU usage per core
|
||||
memory = true # RAM and swap usage
|
||||
network = true # Network RX/TX
|
||||
disk = true # Disk usage
|
||||
processes = false # Top 10 processes (disabled by default: per-process labels can produce high-cardinality time series)
|
||||
temperature = true # System temperatures (if available)
|
||||
|
||||
# Process filtering configuration
|
||||
# Only used when processes = true
|
||||
[metrics.process_filter]
|
||||
# Option 1: Use an external file for server-specific process lists
|
||||
# include = "processes.toml"
|
||||
|
||||
# Option 2: Configure inline
|
||||
# Filter mode: "whitelist" (only listed processes) or "blacklist" (exclude listed)
|
||||
filter_mode = "whitelist"
|
||||
|
||||
# List of process names to filter (case-insensitive substring match)
|
||||
names = [
|
||||
# Web servers
|
||||
"nginx",
|
||||
"apache",
|
||||
|
||||
# Databases
|
||||
"postgres",
|
||||
"mysql",
|
||||
"redis",
|
||||
|
||||
# Application servers
|
||||
# "java",
|
||||
# "node",
|
||||
# "python",
|
||||
]
|
||||
|
||||
# List of regex patterns to match process names (unlike `names`, matching here is case-sensitive)
|
||||
patterns = [
|
||||
# Example: Match specific versions
|
||||
# "^nginx-[0-9.]+$",
|
||||
# "^node-v[0-9]+",
|
||||
]
|
||||
|
||||
# List of specific process PIDs to filter
|
||||
pids = []
|
||||
Reference in New Issue
Block a user