complete observability stack alloy
This commit is contained in:
227
docker-compose-alloy/METRICS.md
Normal file
227
docker-compose-alloy/METRICS.md
Normal file
@@ -0,0 +1,227 @@
|
||||
# Symon OpenTelemetry Metrics Reference
|
||||
|
||||
This document lists all metrics exported by Symon when running with the `opentelemetry` feature enabled.
|
||||
|
||||
## System Metrics
|
||||
|
||||
### CPU
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|------------|------|--------|-------------|
|
||||
| `system_cpu_usage_percent` | Gauge | `cpu_id` | CPU usage percentage per core |
|
||||
|
||||
**Example:**
|
||||
```promql
|
||||
# Average CPU across all cores
|
||||
avg(system_cpu_usage_percent)
|
||||
|
||||
# CPU usage for core 0
|
||||
system_cpu_usage_percent{cpu_id="0"}
|
||||
```
|
||||
|
||||
### Memory
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|------------|------|--------|-------------|
|
||||
| `system_memory_usage_bytes` | Gauge | - | RAM memory currently in use |
|
||||
| `system_memory_total_bytes` | Gauge | - | Total RAM memory available |
|
||||
| `system_swap_usage_bytes` | Gauge | - | Swap memory currently in use |
|
||||
| `system_swap_total_bytes` | Gauge | - | Total swap memory available |
|
||||
|
||||
**Example:**
|
||||
```promql
|
||||
# Memory usage percentage
|
||||
(system_memory_usage_bytes / system_memory_total_bytes) * 100
|
||||
|
||||
# Available memory
|
||||
system_memory_total_bytes - system_memory_usage_bytes
|
||||
```
|
||||
|
||||
### Network
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|------------|------|--------|-------------|
|
||||
| `system_network_rx_bytes_rate` | Gauge | `interface` | Network receive rate in bytes/sec |
|
||||
| `system_network_tx_bytes_rate` | Gauge | `interface` | Network transmit rate in bytes/sec |
|
||||
|
||||
**Example:**
|
||||
```promql
|
||||
# Total network throughput
|
||||
sum(system_network_rx_bytes_rate) + sum(system_network_tx_bytes_rate)
|
||||
|
||||
# RX rate for specific interface
|
||||
system_network_rx_bytes_rate{interface="eth0"}
|
||||
```
|
||||
|
||||
### Disk
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|------------|------|--------|-------------|
|
||||
| `system_disk_usage_bytes` | Gauge | `device`, `mount` | Disk space currently in use |
|
||||
| `system_disk_total_bytes` | Gauge | `device`, `mount` | Total disk space available |
|
||||
|
||||
**Example:**
|
||||
```promql
|
||||
# Disk usage percentage
|
||||
(system_disk_usage_bytes / system_disk_total_bytes) * 100
|
||||
|
||||
# Free disk space
|
||||
system_disk_total_bytes - system_disk_usage_bytes
|
||||
```
|
||||
|
||||
### Temperature
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|------------|------|--------|-------------|
|
||||
| `system_temperature_celsius` | Gauge | `sensor` | Temperature readings in Celsius |
|
||||
|
||||
**Example:**
|
||||
```promql
|
||||
# Average temperature across all sensors
|
||||
avg(system_temperature_celsius)
|
||||
|
||||
# Maximum temperature
|
||||
max(system_temperature_celsius)
|
||||
```
|
||||
|
||||
## Process Metrics
|
||||
|
||||
| Metric Name | Type | Labels | Description |
|
||||
|------------|------|--------|-------------|
|
||||
| `system_process_cpu_usage_percent` | Gauge | `name`, `pid` | CPU usage percentage per process |
|
||||
| `system_process_memory_usage_bytes` | Gauge | `name`, `pid` | Memory usage in bytes per process |
|
||||
| `system_process_count` | Gauge | - | Total number of processes |
|
||||
|
||||
**Example:**
|
||||
```promql
|
||||
# Top 10 processes by CPU
|
||||
topk(10, system_process_cpu_usage_percent)
|
||||
|
||||
# Top 10 processes by memory
|
||||
topk(10, system_process_memory_usage_bytes)
|
||||
|
||||
# Total memory used by all Chrome processes
|
||||
sum(system_process_memory_usage_bytes{name=~".*chrome.*"})
|
||||
```
|
||||
|
||||
## Recording Rules
|
||||
|
||||
The following recording rules are pre-configured in Prometheus (see `rules/symon_rules.yml`):
|
||||
|
||||
| Rule Name | Expression | Description |
|
||||
|-----------|------------|-------------|
|
||||
| `system_process_cpu_usage_percent:recent` | Recent process CPU metrics | Filters out stale process data (>2 min old) |
|
||||
| `system_process_memory_usage_bytes:recent` | Recent process memory metrics | Filters out stale process data (>2 min old) |
|
||||
|
||||
**Example:**
|
||||
```promql
|
||||
# Query only recent process data
|
||||
topk(10, system_process_cpu_usage_percent:recent)
|
||||
```
|
||||
|
||||
## Common Queries
|
||||
|
||||
### System Health
|
||||
|
||||
```promql
|
||||
# Overall system CPU usage
|
||||
avg(system_cpu_usage_percent)
|
||||
|
||||
# Memory pressure (>80% is high)
|
||||
(system_memory_usage_bytes / system_memory_total_bytes) * 100
|
||||
|
||||
# Disk pressure (>90% is critical)
|
||||
(system_disk_usage_bytes / system_disk_total_bytes) * 100
|
||||
```
|
||||
|
||||
### Resource Hogs
|
||||
|
||||
```promql
|
||||
# Top CPU consumers
|
||||
topk(5, system_process_cpu_usage_percent)
|
||||
|
||||
# Top memory consumers
|
||||
topk(5, system_process_memory_usage_bytes)
|
||||
|
||||
# Processes using >1GB memory
|
||||
system_process_memory_usage_bytes > 1073741824
|
||||
```
|
||||
|
||||
### Network Analysis
|
||||
|
||||
```promql
|
||||
# Total network traffic (RX + TX)
|
||||
sum(system_network_rx_bytes_rate) + sum(system_network_tx_bytes_rate)
|
||||
|
||||
# Network traffic by interface
|
||||
sum by (interface) (system_network_rx_bytes_rate + system_network_tx_bytes_rate)
|
||||
|
||||
# Interfaces with high RX rate (>10MB/s)
|
||||
system_network_rx_bytes_rate > 10485760
|
||||
```
|
||||
|
||||
## Alerting Examples
|
||||
|
||||
### Sample Prometheus Alert Rules
|
||||
|
||||
```yaml
|
||||
groups:
|
||||
- name: Symon_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- alert: HighCPUUsage
|
||||
expr: avg(system_cpu_usage_percent) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU usage detected"
|
||||
description: "Average CPU usage is {{ $value }}%"
|
||||
|
||||
- alert: HighMemoryUsage
|
||||
expr: (system_memory_usage_bytes / system_memory_total_bytes) * 100 > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High memory usage detected"
|
||||
description: "Memory usage is {{ $value }}%"
|
||||
|
||||
- alert: DiskAlmostFull
|
||||
expr: (system_disk_usage_bytes / system_disk_total_bytes) * 100 > 90
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Disk {{ $labels.mount }} almost full"
|
||||
description: "Disk usage is {{ $value }}% on {{ $labels.mount }}"
|
||||
```
|
||||
|
||||
## Label Reference
|
||||
|
||||
| Label | Used In | Description |
|
||||
|-------|---------|-------------|
|
||||
| `cpu_id` | CPU metrics | CPU core identifier (0, 1, 2, ...) |
|
||||
| `interface` | Network metrics | Network interface name (eth0, wlan0, ...) |
|
||||
| `device` | Disk metrics | Device name (/dev/sda1, ...) |
|
||||
| `mount` | Disk metrics | Mount point (/, /home, ...) |
|
||||
| `sensor` | Temperature | Temperature sensor name |
|
||||
| `name` | Process metrics | Process name |
|
||||
| `pid` | Process metrics | Process ID |
|
||||
| `exported_job` | All | Always "Symon-system-monitor" |
|
||||
| `otel_scope_name` | All | Always "Symon-system-monitor" |
|
||||
|
||||
## Data Retention
|
||||
|
||||
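By default, Prometheus stores metrics for 15 days. Retention is not a `prometheus.yml` setting; adjust it with the `--storage.tsdb.retention.time` command-line flag:
|
||||
|
||||
```yaml
|
||||
# In docker-compose.yml, add to the prometheus service's existing `command:` list
|
||||
command:
|
||||
- '--storage.tsdb.retention.time=30d' # Keep data for 30 days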
|
||||
```
|
||||
|
||||
For long-term storage, consider using:
|
||||
- **TimescaleDB** (see `docker-compose-timescale.yml.ko`)
|
||||
- **Thanos** for multi-cluster metrics
|
||||
- **Cortex** for horizontally scalable storage
|
||||
148
docker-compose-alloy/README-ALLOY.md
Normal file
148
docker-compose-alloy/README-ALLOY.md
Normal file
@@ -0,0 +1,148 @@
|
||||
# Stack Observability con Grafana Alloy
|
||||
|
||||
Questa directory contiene uno stack di observability **semplificato** che usa **Grafana Alloy** al posto di tre componenti separati.
|
||||
|
||||
## Differenze con docker-compose originale
|
||||
|
||||
### Setup Originale (7 servizi)
|
||||
```
|
||||
- Promtail → Raccolta log dai container
|
||||
- Node Exporter → Metriche sistema host
|
||||
- OTEL Collector → Ricezione telemetria OTLP
|
||||
- Prometheus → Storage metriche
|
||||
- Loki → Storage log
|
||||
- Tempo → Storage traces
|
||||
- Grafana → Visualizzazione
|
||||
```
|
||||
|
||||
### Setup con Alloy (5 servizi) ⭐
|
||||
```
|
||||
- Alloy → Raccolta log + metriche host + OTLP (tutto in uno!)
|
||||
- Prometheus → Storage metriche
|
||||
- Loki → Storage log
|
||||
- Tempo → Storage traces
|
||||
- Grafana → Visualizzazione
|
||||
+ Pyroscope → Continuous profiling
|
||||
```
|
||||
|
||||
## Cosa fa Alloy
|
||||
|
||||
Grafana Alloy sostituisce **3 servizi** in uno:
|
||||
|
||||
| Funzione | Prima | Dopo |
|
||||
|----------|-------|------|
|
||||
| **Raccolta log Docker** | Promtail | Alloy |
|
||||
| **Metriche sistema host** | Node Exporter | Alloy (unix exporter integrato) |
|
||||
| **Ricezione OTLP** | OTEL Collector | Alloy (otelcol receiver) |
|
||||
|
||||
### Vantaggi di Alloy:
|
||||
- ✅ **Meno container** da gestire (5 invece di 7)
|
||||
- ✅ **Configurazione unificata** in un solo file (alloy-config.alloy)
|
||||
- ✅ **Meno risorse** consumate (CPU/RAM)
|
||||
- ✅ **UI integrata** per debugging (http://localhost:12345)
|
||||
- ✅ **Più moderno** (linguaggio River invece di YAML)
|
||||
- ✅ **Hot reload** della configurazione
|
||||
|
||||
### Come funziona:
|
||||
|
||||
1. **Logs**: Alloy scopre i container con label `logging=alloy` e invia i log a Loki
|
||||
2. **Metrics**: Alloy raccoglie metriche host e le invia a Prometheus via Remote Write
|
||||
3. **Traces**: Alloy riceve traces OTLP e le inoltra a Tempo
|
||||
|
||||
## Porte esposte
|
||||
|
||||
| Servizio | Porta | Descrizione |
|
||||
|----------|-------|-------------|
|
||||
| Alloy | 12345 | UI e metriche self-monitoring |
|
||||
| Alloy | 4317 | OTLP gRPC (traces/metrics) |
|
||||
| Alloy | 4318 | OTLP HTTP (traces/metrics) |
|
||||
| Prometheus | 9090 | Query e UI |
|
||||
| Loki | 3100 | Push/Query API |
|
||||
| Tempo | 3200 | Query API |
|
||||
| Pyroscope | 4040 | Profiling UI e API |
|
||||
| Grafana | 3000 | Dashboard |
|
||||
|
||||
## Come usare
|
||||
|
||||
### Avviare lo stack:
|
||||
```bash
|
||||
cd docker-compose-alloy
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
### Verificare Alloy:
|
||||
```bash
|
||||
# UI di Alloy (molto utile per debugging!)
|
||||
open http://localhost:12345
|
||||
|
||||
# Vedere i target scoperti
|
||||
curl http://localhost:12345/api/v0/component/discovery.docker.containers/targets
|
||||
|
||||
# Vedere la configurazione caricata
|
||||
curl http://localhost:12345/api/v0/web/components
|
||||
```
|
||||
|
||||
### Accedere ai servizi:
|
||||
- **Grafana**: http://localhost:3000 (admin/admin)
|
||||
- **Alloy UI**: http://localhost:12345
|
||||
- **Prometheus**: http://localhost:9090
|
||||
- **Pyroscope**: http://localhost:4040
|
||||
|
||||
## Configurazione
|
||||
|
||||
### File principale:
|
||||
- **alloy-config.alloy**: Configurazione unificata di Alloy (sostituisce promtail-config.yml e otel-collector-config.yml)
|
||||
- **prometheus.yml**: Configurazione Prometheus (semplificata, Alloy fa remote write)
|
||||
- **loki-config.yml**: Configurazione Loki (invariata)
|
||||
- **tempo-config.yml**: Configurazione Tempo (invariata)
|
||||
|
||||
### Modificare la configurazione di Alloy:
|
||||
|
||||
1. Modifica `alloy-config.alloy`
|
||||
2. Riavvia il container: `docker-compose restart alloy`
|
||||
3. Verifica la configurazione: http://localhost:12345
|
||||
|
||||
## Metriche disponibili
|
||||
|
||||
Alloy espone le stesse metriche di Node Exporter con prefisso `node_*`:
|
||||
```promql
|
||||
# CPU usage
|
||||
rate(node_cpu_seconds_total[5m])
|
||||
|
||||
# Memory
|
||||
node_memory_MemAvailable_bytes
|
||||
|
||||
# Disk
|
||||
node_filesystem_avail_bytes
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Verificare che Alloy stia raccogliendo log:
|
||||
```bash
|
||||
docker-compose logs alloy | grep loki
|
||||
```
|
||||
|
||||
### Verificare che Alloy stia inviando metriche a Prometheus:
|
||||
```bash
|
||||
curl http://localhost:9090/api/v1/label/__name__/values | grep node_
|
||||
```
|
||||
|
||||
### Vedere i componenti attivi in Alloy:
|
||||
```bash
|
||||
curl http://localhost:12345/api/v0/web/components | jq
|
||||
```
|
||||
|
||||
## Migrazione dalla versione originale
|
||||
|
||||
Se hai già la versione con Promtail/Node-Exporter/OTEL:
|
||||
|
||||
1. Ferma lo stack vecchio: `cd ../docker-compose && docker-compose down`
|
||||
2. Avvia il nuovo: `cd ../docker-compose-alloy && docker-compose up -d`
|
||||
3. I dati storici in Prometheus/Loki/Tempo sono preservati nei volumi Docker
|
||||
|
||||
## Link utili
|
||||
|
||||
- [Grafana Alloy Documentation](https://grafana.com/docs/alloy/latest/)
|
||||
- [Alloy Configuration Reference](https://grafana.com/docs/alloy/latest/reference/)
|
||||
- [River Language](https://grafana.com/docs/alloy/latest/concepts/configuration-syntax/)
|
||||
195
docker-compose-alloy/README.md
Normal file
195
docker-compose-alloy/README.md
Normal file
@@ -0,0 +1,195 @@
|
||||
# Symon OpenTelemetry Docker Compose Setup
|
||||
|
||||
This directory contains a Docker Compose setup for running an observability stack to monitor Symon with OpenTelemetry.
|
||||
|
||||
## Architecture
|
||||
|
||||
The stack includes:
|
||||
|
||||
1. **OpenTelemetry Collector** - Receives metrics from Symon via OTLP protocol
|
||||
2. **Prometheus** - Scrapes and stores metrics from the OTEL Collector
|
||||
3. **Grafana** - Visualizes metrics from Prometheus
|
||||
|
||||
```
|
||||
Symon (with --headless flag)
|
||||
↓ (OTLP/gRPC on port 4317)
|
||||
OpenTelemetry Collector
|
||||
↓ (Prometheus scrape on port 8889)
|
||||
Prometheus
|
||||
↓ (Query on port 9090)
|
||||
Grafana (accessible on port 3000)
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### 1. Start the observability stack
|
||||
|
||||
```bash
|
||||
cd docker-compose
|
||||
docker-compose up -d
|
||||
```
|
||||
|
||||
This will start:
|
||||
- OpenTelemetry Collector on ports 4317 (gRPC), 4318 (HTTP), 8889 (metrics)
|
||||
- Prometheus on port 9090
|
||||
- Grafana on port 3000
|
||||
|
||||
### 2. Build Symon with OpenTelemetry support
|
||||
|
||||
```bash
|
||||
cd ..
|
||||
cargo build --release --features opentelemetry
|
||||
```
|
||||
|
||||
### 3. Create a configuration file
|
||||
|
||||
Create a `Symon-config.toml` file:
|
||||
|
||||
```toml
|
||||
[opentelemetry]
|
||||
enabled = true
|
||||
endpoint = "http://localhost:4317"
|
||||
service_name = "Symon-system-monitor"
|
||||
export_interval_ms = 5000
|
||||
|
||||
[opentelemetry.metrics]
|
||||
cpu = true
|
||||
memory = true
|
||||
network = true
|
||||
disk = true
|
||||
processes = true
|
||||
temperature = true
|
||||
gpu = true
|
||||
```
|
||||
|
||||
### 4. Run Symon in headless mode
|
||||
|
||||
```bash
|
||||
./target/release/btm --config Symon-config.toml --headless
|
||||
```
|
||||
|
||||
Or without config file:
|
||||
|
||||
```bash
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 \
|
||||
./target/release/btm --headless
|
||||
```
|
||||
|
||||
### 5. Access the dashboards
|
||||
|
||||
- **Prometheus**: http://localhost:9090
|
||||
- **Grafana**: http://localhost:3000 (username: `admin`, password: `admin`)
|
||||
|
||||
## Configuration Files
|
||||
|
||||
### otel-collector-config.yml
|
||||
|
||||
Configures the OpenTelemetry Collector to:
|
||||
- Receive OTLP data on ports 4317 (gRPC) and 4318 (HTTP)
|
||||
- Export metrics in Prometheus format on port 8889
|
||||
- Debug log all received data
|
||||
|
||||
### prometheus.yml
|
||||
|
||||
Configures Prometheus to:
|
||||
- Scrape metrics from the OTEL Collector every 10 seconds
|
||||
- Load recording rules from `rules/symon_rules.yml`
|
||||
|
||||
### rules/symon_rules.yml
|
||||
|
||||
Contains Prometheus recording rules for Symon metrics, including:
|
||||
- Recent process CPU usage metrics
|
||||
- Recent process memory usage metrics
|
||||
|
||||
## Viewing Metrics in Prometheus
|
||||
|
||||
1. Go to http://localhost:9090
|
||||
2. Click on "Graph"
|
||||
3. Try these example queries:
|
||||
|
||||
```promql
|
||||
# CPU usage by core
|
||||
system_cpu_usage_percent
|
||||
|
||||
# Memory usage
|
||||
system_memory_usage_bytes
|
||||
|
||||
# Network RX/TX
|
||||
system_network_rx_bytes_rate
|
||||
system_network_tx_bytes_rate
|
||||
|
||||
# Disk usage
|
||||
system_disk_usage_bytes
|
||||
|
||||
# Top processes by CPU
|
||||
topk(10, system_process_cpu_usage_percent)
|
||||
|
||||
# Top processes by memory
|
||||
topk(10, system_process_memory_usage_bytes)
|
||||
```
|
||||
|
||||
## Grafana Configuration
|
||||
|
||||
Grafana is automatically configured with:
|
||||
- **Prometheus data source** (http://prometheus:9090) - pre-configured
|
||||
- **Symon System Overview dashboard** - pre-loaded
|
||||
|
||||
To access:
|
||||
1. Go to http://localhost:3000 (username: `admin`, password: `admin`)
|
||||
2. Navigate to Dashboards → Browse → "Symon System Overview"
|
||||
|
||||
The dashboard includes:
|
||||
- CPU usage by core
|
||||
- Memory usage (RAM/Swap)
|
||||
- Network traffic
|
||||
- Disk usage
|
||||
- Top 10 processes by CPU
|
||||
- Top 10 processes by Memory
|
||||
|
||||
## Stopping the Stack
|
||||
|
||||
```bash
|
||||
docker-compose down
|
||||
```
|
||||
|
||||
To also remove volumes:
|
||||
|
||||
```bash
|
||||
docker-compose down -v
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Symon not sending metrics
|
||||
|
||||
Check the OTEL Collector logs:
|
||||
```bash
|
||||
docker-compose logs -f otel-collector
|
||||
```
|
||||
|
||||
You should see messages about receiving metrics.
|
||||
|
||||
### Prometheus not scraping
|
||||
|
||||
1. Check Prometheus targets at http://localhost:9090/targets
|
||||
2. The `otel-collector` target should be UP
|
||||
|
||||
### No data in Grafana
|
||||
|
||||
1. Verify Prometheus data source is configured correctly
|
||||
2. Check that Prometheus has data by querying directly
|
||||
3. Ensure your time range in Grafana includes when Symon was running
|
||||
|
||||
## Advanced Configuration
|
||||
|
||||
### Using with TimescaleDB (optional)
|
||||
|
||||
A TimescaleDB configuration file is available as `docker-compose-timescale.yml.ko` for long-term storage of metrics. Rename it to include it in your stack.
|
||||
|
||||
### Custom Prometheus Rules
|
||||
|
||||
Edit `rules/symon_rules.yml` to add custom recording or alerting rules.
|
||||
|
||||
### OTEL Collector Sampling
|
||||
|
||||
Edit `otel-collector-config.yml` to adjust the batch processor settings for different performance characteristics.
|
||||
169
docker-compose-alloy/alloy-config.alloy
Normal file
169
docker-compose-alloy/alloy-config.alloy
Normal file
@@ -0,0 +1,169 @@
|
||||
// Grafana Alloy Configuration
|
||||
// Sostituisce: Promtail + Node Exporter + OTEL Collector
|
||||
|
||||
// ============================================================================
|
||||
// LOGGING - Raccolta log dai container Docker (sostituisce Promtail)
|
||||
// ============================================================================
|
||||
|
||||
// Discover Docker containers carrying the label logging=alloy.
// This must match the `logging: "alloy"` label set on the services in
// docker-compose.yml; with any other value no container is selected and no
// logs are collected.
discovery.docker "containers" {
  host = "unix:///var/run/docker.sock"

  filter {
    name   = "label"
    values = ["logging=alloy"]
  }

  refresh_interval = "5s"
}
|
||||
|
||||
// Scrape log dai container scoperti
|
||||
loki.source.docker "containers" {
|
||||
host = "unix:///var/run/docker.sock"
|
||||
targets = discovery.docker.containers.targets
|
||||
forward_to = [loki.relabel.docker.receiver]
|
||||
}
|
||||
|
||||
// Relabeling per aggiungere label ai log
|
||||
loki.relabel "docker" {
|
||||
forward_to = [loki.write.default.receiver]
|
||||
|
||||
rule {
|
||||
source_labels = ["__meta_docker_container_name"]
|
||||
regex = "/(.*)"
|
||||
target_label = "container"
|
||||
}
|
||||
|
||||
rule {
|
||||
source_labels = ["__meta_docker_container_label_logging_jobname"]
|
||||
target_label = "job"
|
||||
}
|
||||
}
|
||||
|
||||
// Scrivi i log a Loki
|
||||
loki.write "default" {
|
||||
endpoint {
|
||||
url = "http://loki:3100/loki/api/v1/push"
|
||||
}
|
||||
|
||||
external_labels = {
|
||||
environment = "production",
|
||||
cluster = "myapp-cluster",
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// METRICS - Metriche sistema host (sostituisce Node Exporter)
|
||||
// ============================================================================
|
||||
|
||||
// Host system metrics (replaces Node Exporter).
prometheus.exporter.unix "host" {
  // Alloy runs in a container, so the default /proc, /sys and / belong to the
  // container. Point the exporter at the host filesystems that
  // docker-compose.yml bind-mounts read-only into the container.
  procfs_path = "/host/proc"
  sysfs_path  = "/host/sys"
  rootfs_path = "/rootfs"

  // Operating-system collectors to enable.
  set_collectors = [
    "cpu",
    "loadavg",
    "meminfo",
    "netdev",
    "diskstats",
    "filesystem",
    "uname",
    "time",
  ]
}
|
||||
|
||||
// Scrape delle metriche raccolte
|
||||
prometheus.scrape "host_metrics" {
|
||||
targets = prometheus.exporter.unix.host.targets
|
||||
forward_to = [prometheus.remote_write.default.receiver]
|
||||
|
||||
scrape_interval = "10s"
|
||||
|
||||
clustering {
|
||||
enabled = false
|
||||
}
|
||||
}
|
||||
|
||||
// Scraping self-monitoring di Alloy
|
||||
prometheus.scrape "alloy_metrics" {
|
||||
targets = [{
|
||||
__address__ = "localhost:12345",
|
||||
}]
|
||||
forward_to = [prometheus.remote_write.default.receiver]
|
||||
|
||||
scrape_interval = "10s"
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// TRACES - Ricezione traces OTLP (sostituisce OTEL Collector)
|
||||
// ============================================================================
|
||||
|
||||
// Ricevi traces via OTLP gRPC
|
||||
otelcol.receiver.otlp "default" {
|
||||
grpc {
|
||||
endpoint = "0.0.0.0:4317"
|
||||
}
|
||||
|
||||
http {
|
||||
endpoint = "0.0.0.0:4318"
|
||||
}
|
||||
|
||||
output {
|
||||
metrics = [otelcol.processor.batch.default.input]
|
||||
logs = [otelcol.processor.batch.default.input]
|
||||
traces = [otelcol.processor.batch.default.input]
|
||||
}
|
||||
}
|
||||
|
||||
// Batch processor per ottimizzare l'invio
|
||||
otelcol.processor.batch "default" {
|
||||
timeout = "10s"
|
||||
send_batch_size = 10000
|
||||
send_batch_max_size = 11000
|
||||
|
||||
output {
|
||||
metrics = [otelcol.exporter.prometheus.default.input]
|
||||
traces = [otelcol.exporter.otlp.tempo.input]
|
||||
logs = [otelcol.exporter.loki.default.input]
|
||||
}
|
||||
}
|
||||
|
||||
// Esporta metriche OTLP a Prometheus
|
||||
otelcol.exporter.prometheus "default" {
|
||||
forward_to = [prometheus.remote_write.default.receiver]
|
||||
}
|
||||
|
||||
// Esporta traces a Tempo
|
||||
otelcol.exporter.otlp "tempo" {
|
||||
client {
|
||||
endpoint = "tempo:4317"
|
||||
tls {
|
||||
insecure = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Esporta log OTLP a Loki
|
||||
otelcol.exporter.loki "default" {
|
||||
forward_to = [loki.write.default.receiver]
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// REMOTE WRITE - Invia metriche a Prometheus
|
||||
// ============================================================================
|
||||
|
||||
prometheus.remote_write "default" {
|
||||
endpoint {
|
||||
url = "http://prometheus:9090/api/v1/write"
|
||||
|
||||
metadata_config {
|
||||
send_interval = "1m"
|
||||
}
|
||||
|
||||
queue_config {
|
||||
capacity = 10000
|
||||
max_shards = 10
|
||||
min_shards = 1
|
||||
max_samples_per_send = 5000
|
||||
}
|
||||
}
|
||||
}
|
||||
120
docker-compose-alloy/docker-compose.yml
Normal file
120
docker-compose-alloy/docker-compose.yml
Normal file
@@ -0,0 +1,120 @@
|
||||
services:
|
||||
|
||||
alloy:
|
||||
image: grafana/alloy:latest
|
||||
container_name: alloy
|
||||
command:
|
||||
- run
|
||||
- /etc/alloy/config.alloy
|
||||
- --server.http.listen-addr=0.0.0.0:12345
|
||||
- --storage.path=/var/lib/alloy/data
|
||||
volumes:
|
||||
- ./alloy-config.alloy:/etc/alloy/config.alloy:ro
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
ports:
|
||||
- "4317:4317" # OTLP gRPC
|
||||
- "4318:4318" # OTLP HTTP
|
||||
- "12345:12345" # Alloy UI and metrics
|
||||
networks:
|
||||
- observ-net
|
||||
restart: unless-stopped
|
||||
privileged: true
|
||||
labels:
|
||||
logging: "alloy"
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
container_name: prometheus
|
||||
volumes:
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- ./rules:/etc/prometheus/rules
|
||||
ports:
|
||||
- "9090:9090" # Interfaccia Web di Prometheus
|
||||
command:
|
||||
- '--config.file=/etc/prometheus/prometheus.yml'
|
||||
- '--web.enable-remote-write-receiver'
|
||||
- '--enable-feature=exemplar-storage'
|
||||
depends_on:
|
||||
- alloy
|
||||
networks:
|
||||
- observ-net
|
||||
labels:
|
||||
logging: "alloy"
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
container_name: grafana
|
||||
ports:
|
||||
- "3000:3000"
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_PASSWORD=admin
|
||||
- GF_SECURITY_ADMIN_USER=admin
|
||||
volumes:
|
||||
- grafana-storage:/var/lib/grafana
|
||||
- ./grafana/provisioning:/etc/grafana/provisioning
|
||||
depends_on:
|
||||
- prometheus
|
||||
networks:
|
||||
- observ-net
|
||||
labels:
|
||||
logging: "alloy"
|
||||
|
||||
loki:
|
||||
image: grafana/loki:latest
|
||||
container_name: loki
|
||||
user: "0"
|
||||
ports:
|
||||
- "3100:3100"
|
||||
volumes:
|
||||
- ./loki-config.yml:/etc/loki/local-config.yaml
|
||||
- ./loki-data:/loki
|
||||
command: -config.file=/etc/loki/local-config.yaml
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- observ-net
|
||||
labels:
|
||||
logging: "alloy"
|
||||
|
||||
tempo:
|
||||
image: grafana/tempo:latest
|
||||
container_name: tempo
|
||||
user: "0"
|
||||
command: ["-config.file=/etc/tempo.yml"]
|
||||
volumes:
|
||||
- ./tempo-config.yml:/etc/tempo.yml
|
||||
- tempo-data:/tmp/tempo
|
||||
ports:
|
||||
- "3200:3200" # Tempo HTTP
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- observ-net
|
||||
labels:
|
||||
logging: "alloy"
|
||||
|
||||
pyroscope:
|
||||
image: grafana/pyroscope:latest
|
||||
container_name: pyroscope
|
||||
ports:
|
||||
- "4040:4040" # Pyroscope UI and API
|
||||
volumes:
|
||||
- pyroscope-data:/var/lib/pyroscope
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- observ-net
|
||||
labels:
|
||||
logging: "alloy"
|
||||
environment:
|
||||
- PYROSCOPE_LOG_LEVEL=info
|
||||
|
||||
|
||||
volumes:
|
||||
grafana-storage:
|
||||
tempo-data:
|
||||
pyroscope-data:
|
||||
|
||||
networks:
|
||||
observ-net:
|
||||
driver: bridge
|
||||
@@ -0,0 +1,74 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
  type: prometheus
  # Fixed uid: the Tempo datasource refers to this datasource via
  # `datasourceUid: prometheus` (tracesToMetrics, serviceMap). Without an
  # explicit uid Grafana generates a random one and those links break.
  uid: prometheus
  access: proxy
  url: http://prometheus:9090
  isDefault: true
  editable: true
  jsonData:
    timeInterval: 10s
    queryTimeout: 60s
|
||||
|
||||
- name: Loki
  type: loki
  # Fixed uid: the Tempo datasource refers to this datasource via
  # `datasourceUid: loki` (tracesToLogs, lokiSearch). Without an explicit uid
  # Grafana generates a random one and those links break.
  uid: loki
  access: proxy
  url: http://loki:3100
  isDefault: false
  editable: true
  jsonData:
    maxLines: 1000
    # Turn trace IDs found in log lines into links to the Tempo datasource.
    derivedFields:
      - datasourceUid: tempo
        matcherRegex: "trace_?id[\":]\\s*\"?([0-9a-fA-F]+)"
        name: TraceID
        url: "$${__value.raw}"
|
||||
|
||||
- name: Tempo
|
||||
type: tempo
|
||||
uid: tempo
|
||||
access: proxy
|
||||
url: http://tempo:3200
|
||||
isDefault: false
|
||||
editable: true
|
||||
jsonData:
|
||||
httpMethod: GET
|
||||
tracesToLogs:
|
||||
datasourceUid: loki
|
||||
mapTagNamesEnabled: true
|
||||
mappedTags:
|
||||
- key: service.name
|
||||
value: service_name
|
||||
spanStartTimeShift: '-1h'
|
||||
spanEndTimeShift: '1h'
|
||||
filterByTraceID: true
|
||||
filterBySpanID: false
|
||||
tracesToMetrics:
|
||||
datasourceUid: prometheus
|
||||
spanStartTimeShift: '-1h'
|
||||
spanEndTimeShift: '1h'
|
||||
tracesToProfiles:
|
||||
datasourceUid: pyroscope
|
||||
tags:
|
||||
- key: service.name
|
||||
value: service_name
|
||||
serviceMap:
|
||||
datasourceUid: prometheus
|
||||
nodeGraph:
|
||||
enabled: true
|
||||
search:
|
||||
hide: false
|
||||
lokiSearch:
|
||||
datasourceUid: loki
|
||||
|
||||
- name: Pyroscope
|
||||
type: grafana-pyroscope-datasource
|
||||
uid: pyroscope
|
||||
access: proxy
|
||||
url: http://pyroscope:4040
|
||||
isDefault: false
|
||||
editable: true
|
||||
jsonData:
|
||||
keepCookies: []
|
||||
minStep: '15s'
|
||||
43
docker-compose-alloy/loki-config.yml
Normal file
43
docker-compose-alloy/loki-config.yml
Normal file
@@ -0,0 +1,43 @@
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
grpc_listen_port: 9096
|
||||
|
||||
common:
|
||||
instance_addr: 127.0.0.1
|
||||
path_prefix: /loki
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /loki/chunks
|
||||
rules_directory: /loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
query_range:
|
||||
results_cache:
|
||||
cache:
|
||||
embedded_cache:
|
||||
enabled: true
|
||||
max_size_mb: 100
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2020-10-24
|
||||
store: tsdb
|
||||
object_store: filesystem
|
||||
schema: v13
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
ruler:
|
||||
alertmanager_url: http://localhost:9093
|
||||
|
||||
limits_config:
|
||||
retention_period: 720h
|
||||
ingestion_rate_mb: 10
|
||||
ingestion_burst_size_mb: 20
|
||||
volume_enabled: true
|
||||
22
docker-compose-alloy/prometheus.yml
Normal file
22
docker-compose-alloy/prometheus.yml
Normal file
@@ -0,0 +1,22 @@
|
||||
global:
|
||||
scrape_interval: 10s
|
||||
evaluation_interval: 10s
|
||||
|
||||
rule_files:
|
||||
- /etc/prometheus/rules/*.yml
|
||||
|
||||
scrape_configs:
|
||||
# Job 1: Monitora se Prometheus stesso è attivo
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
# Job 2: Scrape metriche di Alloy (self-monitoring)
|
||||
- job_name: 'alloy'
|
||||
static_configs:
|
||||
- targets: ['alloy:12345']
|
||||
labels:
|
||||
instance: 'alloy'
|
||||
|
||||
# NOTA: Le metriche dell'host (ex-Node Exporter) e OTLP arrivano via Remote Write da Alloy
|
||||
|
||||
15
docker-compose-alloy/rules/symon_rules.yml
Normal file
15
docker-compose-alloy/rules/symon_rules.yml
Normal file
@@ -0,0 +1,15 @@
|
||||
groups:
|
||||
- name: symon_process_metrics
|
||||
interval: 30s
|
||||
rules:
|
||||
- record: system_process_cpu_usage_percent:recent
|
||||
expr: |
|
||||
system_process_cpu_usage_percent
|
||||
and on(pid, name)
|
||||
(time() - timestamp(system_process_cpu_usage_percent) < 120)
|
||||
|
||||
- record: system_process_memory_usage_bytes:recent
|
||||
expr: |
|
||||
system_process_memory_usage_bytes
|
||||
and on(pid, name)
|
||||
(time() - timestamp(system_process_memory_usage_bytes) < 120)
|
||||
49
docker-compose-alloy/tempo-config.yml
Normal file
49
docker-compose-alloy/tempo-config.yml
Normal file
@@ -0,0 +1,49 @@
|
||||
server:
|
||||
http_listen_port: 3200
|
||||
|
||||
distributor:
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
|
||||
ingester:
|
||||
max_block_duration: 5m
|
||||
|
||||
compactor:
|
||||
compaction:
|
||||
block_retention: 48h
|
||||
|
||||
storage:
|
||||
trace:
|
||||
backend: local
|
||||
wal:
|
||||
path: /tmp/tempo/wal
|
||||
local:
|
||||
path: /tmp/tempo/blocks
|
||||
|
||||
query_frontend:
|
||||
search:
|
||||
duration_slo: 5s
|
||||
throughput_bytes_slo: 1.073741824e+09
|
||||
trace_by_id:
|
||||
duration_slo: 5s
|
||||
|
||||
metrics_generator:
|
||||
registry:
|
||||
external_labels:
|
||||
source: tempo
|
||||
cluster: docker-compose
|
||||
storage:
|
||||
path: /tmp/tempo/generator/wal
|
||||
remote_write:
|
||||
- url: http://prometheus:9090/api/v1/write
|
||||
send_exemplars: true
|
||||
|
||||
overrides:
|
||||
defaults:
|
||||
metrics_generator:
|
||||
processors: [service-graphs, span-metrics]
|
||||
80
docker-compose-alloy/test-stack.sh
Executable file
80
docker-compose-alloy/test-stack.sh
Executable file
@@ -0,0 +1,80 @@
|
||||
#!/bin/bash
# Smoke test for the Alloy-based observability stack in this directory.
#
# Checks the endpoints published by docker-compose.yml:
#   - Alloy OTLP gRPC (4317) and OTLP HTTP (4318)
#   - Alloy UI / self-monitoring (12345)
#   - Prometheus (9090) and Grafana (3000)
#
# Exits non-zero on the first hard failure. A Prometheus instance with no
# healthy targets yet only produces a warning.

set -e

echo "🔍 Testing Symon OpenTelemetry Stack..."
echo ""

# Colors
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Print the success marker for the check announced just before.
pass() {
    echo -e "${GREEN}✓ OK${NC}"
}

# Print the failure marker and abort the whole script.
fail() {
    echo -e "${RED}✗ FAILED${NC}"
    exit 1
}

# Test Alloy OTLP gRPC endpoint
echo -n "Testing Alloy OTLP gRPC (port 4317)... "
if nc -zv localhost 4317 2>&1 | grep -q "succeeded\|open"; then
    pass
else
    fail
fi

# Test Alloy OTLP HTTP endpoint
echo -n "Testing Alloy OTLP HTTP (port 4318)... "
if nc -zv localhost 4318 2>&1 | grep -q "succeeded\|open"; then
    pass
else
    fail
fi

# Test Alloy HTTP server (UI and self-monitoring metrics).
# This stack has no standalone OTEL Collector, so nothing listens on 8889;
# Alloy's own HTTP server on 12345 is checked instead.
echo -n "Testing Alloy UI/metrics (port 12345)... "
if curl -sf http://localhost:12345/-/ready > /dev/null; then
    pass
else
    fail
fi

# Test Prometheus
echo -n "Testing Prometheus (port 9090)... "
if curl -s http://localhost:9090/-/healthy | grep -q "Prometheus"; then
    pass
else
    fail
fi

# Test Prometheus targets
echo -n "Testing Prometheus targets... "
TARGETS=$(curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"up"' | wc -l)
if [ "$TARGETS" -gt 0 ]; then
    echo -e "${GREEN}✓ OK${NC} (${TARGETS} targets up)"
else
    echo -e "${YELLOW}⚠ WARNING${NC} (no targets up yet - this is normal if just started)"
fi

# Test Grafana
echo -n "Testing Grafana (port 3000)... "
if curl -s http://localhost:3000/api/health | grep -q "ok"; then
    pass
else
    fail
fi

echo ""
echo -e "${GREEN}✓ All tests passed!${NC}"
echo ""
echo "📊 Access points:"
echo "   - Prometheus: http://localhost:9090"
echo "   - Grafana: http://localhost:3000 (admin/admin)"
echo "   - Alloy UI: http://localhost:12345"
echo ""
echo "💡 Next steps:"
echo "   1. Build Symon with: cargo build --release --features opentelemetry"
echo "   2. Run in headless mode: ./target/release/btm --headless"
echo "   3. Check metrics in Prometheus: http://localhost:9090/graph"
|
||||
Reference in New Issue
Block a user