complete observability stack alloy

2025-11-08 17:57:52 +01:00
parent 4c2a5b6738
commit 869427c24d
11 changed files with 1142 additions and 0 deletions
--- a/docker-compose-alloy/METRICS.md
+++ b/docker-compose-alloy/METRICS.md
@@ -0,0 +1,227 @@
+# Symon OpenTelemetry Metrics Reference
+
+This document lists all metrics exported by Symon when running with the `opentelemetry` feature enabled.
+
+## System Metrics
+
+### CPU
+
+| Metric Name | Type | Labels | Description |
+|------------|------|--------|-------------|
+| `system_cpu_usage_percent` | Gauge | `cpu_id` | CPU usage percentage per core |
+
+**Example:**
+```promql
+# Average CPU across all cores
+avg(system_cpu_usage_percent)
+
+# CPU usage for core 0
+system_cpu_usage_percent{cpu_id="0"}
+```
+
+### Memory
+
+| Metric Name | Type | Labels | Description |
+|------------|------|--------|-------------|
+| `system_memory_usage_bytes` | Gauge | - | RAM memory currently in use |
+| `system_memory_total_bytes` | Gauge | - | Total RAM memory available |
+| `system_swap_usage_bytes` | Gauge | - | Swap memory currently in use |
+| `system_swap_total_bytes` | Gauge | - | Total swap memory available |
+
+**Example:**
+```promql
+# Memory usage percentage
+(system_memory_usage_bytes / system_memory_total_bytes) * 100
+
+# Available memory
+system_memory_total_bytes - system_memory_usage_bytes
+```
+
+### Network
+
+| Metric Name | Type | Labels | Description |
+|------------|------|--------|-------------|
+| `system_network_rx_bytes_rate` | Gauge | `interface` | Network receive rate in bytes/sec |
+| `system_network_tx_bytes_rate` | Gauge | `interface` | Network transmit rate in bytes/sec |
+
+**Example:**
+```promql
+# Total network throughput
+sum(system_network_rx_bytes_rate) + sum(system_network_tx_bytes_rate)
+
+# RX rate for specific interface
+system_network_rx_bytes_rate{interface="eth0"}
+```
+
+### Disk
+
+| Metric Name | Type | Labels | Description |
+|------------|------|--------|-------------|
+| `system_disk_usage_bytes` | Gauge | `device`, `mount` | Disk space currently in use |
+| `system_disk_total_bytes` | Gauge | `device`, `mount` | Total disk space available |
+
+**Example:**
+```promql
+# Disk usage percentage
+(system_disk_usage_bytes / system_disk_total_bytes) * 100
+
+# Free disk space
+system_disk_total_bytes - system_disk_usage_bytes
+```
+
+### Temperature
+
+| Metric Name | Type | Labels | Description |
+|------------|------|--------|-------------|
+| `system_temperature_celsius` | Gauge | `sensor` | Temperature readings in Celsius |
+
+**Example:**
+```promql
+# Average temperature across all sensors
+avg(system_temperature_celsius)
+
+# Maximum temperature
+max(system_temperature_celsius)
+```
+
+## Process Metrics
+
+| Metric Name | Type | Labels | Description |
+|------------|------|--------|-------------|
+| `system_process_cpu_usage_percent` | Gauge | `name`, `pid` | CPU usage percentage per process |
+| `system_process_memory_usage_bytes` | Gauge | `name`, `pid` | Memory usage in bytes per process |
+| `system_process_count` | Gauge | - | Total number of processes |
+
+**Example:**
+```promql
+# Top 10 processes by CPU
+topk(10, system_process_cpu_usage_percent)
+
+# Top 10 processes by memory
+topk(10, system_process_memory_usage_bytes)
+
+# Total memory used by all Chrome processes
+sum(system_process_memory_usage_bytes{name=~".*chrome.*"})
+```
+
+## Recording Rules
+
+The following recording rules are pre-configured in Prometheus (see `rules/symon_rules.yml`):
+
+| Rule Name | Expression | Description |
+|-----------|------------|-------------|
+| `system_process_cpu_usage_percent:recent` | Recent process CPU metrics | Filters out stale process data (>2 min old) |
+| `system_process_memory_usage_bytes:recent` | Recent process memory metrics | Filters out stale process data (>2 min old) |
+
+**Example:**
+```promql
+# Query only recent process data
+topk(10, system_process_cpu_usage_percent:recent)
+```
+
+## Common Queries
+
+### System Health
+
+```promql
+# Overall system CPU usage
+avg(system_cpu_usage_percent)
+
+# Memory pressure (>80% is high)
+(system_memory_usage_bytes / system_memory_total_bytes) * 100
+
+# Disk pressure (>90% is critical)
+(system_disk_usage_bytes / system_disk_total_bytes) * 100
+```
+
+### Resource Hogs
+
+```promql
+# Top CPU consumers
+topk(5, system_process_cpu_usage_percent)
+
+# Top memory consumers
+topk(5, system_process_memory_usage_bytes)
+
+# Processes using >1GB memory
+system_process_memory_usage_bytes > 1073741824
+```
+
+### Network Analysis
+
+```promql
+# Total network traffic (RX + TX)
+sum(system_network_rx_bytes_rate) + sum(system_network_tx_bytes_rate)
+
+# Network traffic by interface
+sum by (interface) (system_network_rx_bytes_rate + system_network_tx_bytes_rate)
+
+# Interfaces with high RX rate (>10MB/s)
+system_network_rx_bytes_rate > 10485760
+```
+
+## Alerting Examples
+
+### Sample Prometheus Alert Rules
+
+```yaml
+groups:
+  - name: Symon_alerts
+    interval: 30s
+    rules:
+      - alert: HighCPUUsage
+        expr: avg(system_cpu_usage_percent) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU usage detected"
+          description: "Average CPU usage is {{ $value }}%"
+
+      - alert: HighMemoryUsage
+        expr: (system_memory_usage_bytes / system_memory_total_bytes) * 100 > 90
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High memory usage detected"
+          description: "Memory usage is {{ $value }}%"
+
+      - alert: DiskAlmostFull
+        expr: (system_disk_usage_bytes / system_disk_total_bytes) * 100 > 90
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Disk {{ $labels.mount }} almost full"
+          description: "Disk usage is {{ $value }}% on {{ $labels.mount }}"
+```
+
+## Label Reference
+
+| Label | Used In | Description |
+|-------|---------|-------------|
+| `cpu_id` | CPU metrics | CPU core identifier (0, 1, 2, ...) |
+| `interface` | Network metrics | Network interface name (eth0, wlan0, ...) |
+| `device` | Disk metrics | Device name (/dev/sda1, ...) |
+| `mount` | Disk metrics | Mount point (/, /home, ...) |
+| `sensor` | Temperature | Temperature sensor name |
+| `name` | Process metrics | Process name |
+| `pid` | Process metrics | Process ID |
+| `exported_job` | All | Always "Symon-system-monitor" |
+| `otel_scope_name` | All | Always "Symon-system-monitor" |
+
+## Data Retention
+
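+By default, Prometheus stores metrics for 15 days. Retention is not a `prometheus.yml` setting; it is controlled by a command-line flag, so adjust it in the `command` list of the `prometheus` service in `docker-compose.yml`:
+
+```yaml
+# In docker-compose.yml, add to the existing services.prometheus.command list
+- '--storage.tsdb.retention.time=30d'  # Keep data for 30 days
+```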
+
+For long-term storage, consider using:
+- **TimescaleDB** (see `docker-compose-timescale.yml.ko`)
+- **Thanos** for multi-cluster metrics
+- **Cortex** for horizontally scalable storage
--- a/docker-compose-alloy/README-ALLOY.md
+++ b/docker-compose-alloy/README-ALLOY.md
@@ -0,0 +1,148 @@
+# Stack Observability con Grafana Alloy
+
+Questa directory contiene uno stack di observability **semplificato** che usa **Grafana Alloy** al posto di tre componenti separati.
+
+## Differenze con docker-compose originale
+
+### Setup Originale (7 servizi)
+```
+- Promtail       → Raccolta log dai container
+- Node Exporter  → Metriche sistema host
+- OTEL Collector → Ricezione telemetria OTLP
+- Prometheus     → Storage metriche
+- Loki           → Storage log
+- Tempo          → Storage traces
+- Grafana        → Visualizzazione
+```
+
+### Setup con Alloy (5 servizi) ⭐
+```
+- Alloy          → Raccolta log + metriche host + OTLP (tutto in uno!)
+- Prometheus     → Storage metriche
+- Loki           → Storage log
+- Tempo          → Storage traces
+- Grafana        → Visualizzazione
+- Pyroscope      → Continuous profiling (servizio aggiuntivo, non presente nel setup originale)
+```
+
+## Cosa fa Alloy
+
+Grafana Alloy sostituisce **3 servizi** in uno:
+
+| Funzione | Prima | Dopo |
+|----------|-------|------|
+| **Raccolta log Docker** | Promtail | Alloy |
+| **Metriche sistema host** | Node Exporter | Alloy (unix exporter integrato) |
+| **Ricezione OTLP** | OTEL Collector | Alloy (otelcol receiver) |
+
+### Vantaggi di Alloy:
+- ✅ **Meno container** da gestire (5 invece di 7)
+- ✅ **Configurazione unificata** in un solo file (alloy-config.alloy)
+- ✅ **Meno risorse** consumate (CPU/RAM)
+- ✅ **UI integrata** per debugging (http://localhost:12345)
+- ✅ **Più moderno** (linguaggio River invece di YAML)
+- ✅ **Hot reload** della configurazione
+
+### Come funziona:
+
+1. **Logs**: Alloy scopre i container con label `logging=alloy` e invia i log a Loki
+2. **Metrics**: Alloy raccoglie metriche host e le invia a Prometheus via Remote Write
+3. **Traces**: Alloy riceve traces OTLP e le inoltra a Tempo
+
+## Porte esposte
+
+| Servizio | Porta | Descrizione |
+|----------|-------|-------------|
+| Alloy | 12345 | UI e metriche self-monitoring |
+| Alloy | 4317 | OTLP gRPC (traces/metrics) |
+| Alloy | 4318 | OTLP HTTP (traces/metrics) |
+| Prometheus | 9090 | Query e UI |
+| Loki | 3100 | Push/Query API |
+| Tempo | 3200 | Query API |
+| Pyroscope | 4040 | Profiling UI e API |
+| Grafana | 3000 | Dashboard |
+
+## Come usare
+
+### Avviare lo stack:
+```bash
+cd docker-compose-alloy
+docker-compose up -d
+```
+
+### Verificare Alloy:
+```bash
+# UI di Alloy (molto utile per debugging!)
+open http://localhost:12345
+
+# Vedere i target scoperti
+curl http://localhost:12345/api/v0/component/discovery.docker.containers/targets
+
+# Vedere la configurazione caricata
+curl http://localhost:12345/api/v0/web/components
+```
+
+### Accedere ai servizi:
+- **Grafana**: http://localhost:3000 (admin/admin)
+- **Alloy UI**: http://localhost:12345
+- **Prometheus**: http://localhost:9090
+- **Pyroscope**: http://localhost:4040
+
+## Configurazione
+
+### File principale:
+- **alloy-config.alloy**: Configurazione unificata di Alloy (sostituisce promtail-config.yml e otel-collector-config.yml)
+- **prometheus.yml**: Configurazione Prometheus (semplificata, Alloy fa remote write)
+- **loki-config.yml**: Configurazione Loki (invariata)
+- **tempo-config.yml**: Configurazione Tempo (invariata)
+
+### Modificare la configurazione di Alloy:
+
+1. Modifica `alloy-config.alloy`
+2. Riavvia il container: `docker-compose restart alloy`
+3. Verifica la configurazione: http://localhost:12345
+
+## Metriche disponibili
+
+Alloy espone le stesse metriche di Node Exporter con prefisso `node_*`:
+```promql
+# CPU usage
+rate(node_cpu_seconds_total[5m])
+
+# Memory
+node_memory_MemAvailable_bytes
+
+# Disk
+node_filesystem_avail_bytes
+```
+
+## Troubleshooting
+
+### Verificare che Alloy stia raccogliendo log:
+```bash
+docker-compose logs alloy | grep loki
+```
+
+### Verificare che Alloy stia inviando metriche a Prometheus:
+```bash
+curl http://localhost:9090/api/v1/label/__name__/values | grep node_
+```
+
+### Vedere i componenti attivi in Alloy:
+```bash
+curl http://localhost:12345/api/v0/web/components | jq
+```
+
+## Migrazione dalla versione originale
+
+Se hai già la versione con Promtail/Node-Exporter/OTEL:
+
+1. Ferma lo stack vecchio: `cd ../docker-compose && docker-compose down`
+2. Avvia il nuovo: `cd ../docker-compose-alloy && docker-compose up -d`
+3. I dati storici in Prometheus/Loki/Tempo sono preservati nei volumi Docker
+
+## Link utili
+
+- [Grafana Alloy Documentation](https://grafana.com/docs/alloy/latest/)
+- [Alloy Configuration Reference](https://grafana.com/docs/alloy/latest/reference/)
+- [River Language](https://grafana.com/docs/alloy/latest/concepts/configuration-syntax/)
--- a/docker-compose-alloy/README.md
+++ b/docker-compose-alloy/README.md
@@ -0,0 +1,195 @@
+# Symon OpenTelemetry Docker Compose Setup
+
+This directory contains a Docker Compose setup for running an observability stack to monitor Symon with OpenTelemetry.
+
+## Architecture
+
+The stack includes:
+
+1. **OpenTelemetry Collector** - Receives metrics from Symon via OTLP protocol
+2. **Prometheus** - Scrapes and stores metrics from the OTEL Collector
+3. **Grafana** - Visualizes metrics from Prometheus
+
+```
+Symon (with --headless flag)
+    ↓ (OTLP/gRPC on port 4317)
+OpenTelemetry Collector
+    ↓ (Prometheus scrape on port 8889)
+Prometheus
+    ↓ (Query on port 9090)
+Grafana (accessible on port 3000)
+```
+
+## Quick Start
+
+### 1. Start the observability stack
+
+```bash
+cd docker-compose
+docker-compose up -d
+```
+
+This will start:
+- OpenTelemetry Collector on ports 4317 (gRPC), 4318 (HTTP), 8889 (metrics)
+- Prometheus on port 9090
+- Grafana on port 3000
+
+### 2. Build Symon with OpenTelemetry support
+
+```bash
+cd ..
+cargo build --release --features opentelemetry
+```
+
+### 3. Create a configuration file
+
+Create a `Symon-config.toml` file:
+
+```toml
+[opentelemetry]
+enabled = true
+endpoint = "http://localhost:4317"
+service_name = "Symon-system-monitor"
+export_interval_ms = 5000
+
+[opentelemetry.metrics]
+cpu = true
+memory = true
+network = true
+disk = true
+processes = true
+temperature = true
+gpu = true
+```
+
+### 4. Run Symon in headless mode
+
+```bash
+./target/release/btm --config Symon-config.toml --headless
+```
+
+Or without config file:
+
+```bash
+OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 \
+./target/release/btm --headless
+```
+
+### 5. Access the dashboards
+
+- **Prometheus**: http://localhost:9090
+- **Grafana**: http://localhost:3000 (username: `admin`, password: `admin`)
+
+## Configuration Files
+
+### otel-collector-config.yml
+
+Configures the OpenTelemetry Collector to:
+- Receive OTLP data on ports 4317 (gRPC) and 4318 (HTTP)
+- Export metrics in Prometheus format on port 8889
+- Debug log all received data
+
+### prometheus.yml
+
+Configures Prometheus to:
+- Scrape metrics from the OTEL Collector every 10 seconds
+- Load recording rules from `rules/symon_rules.yml`
+
+### rules/symon_rules.yml
+
+Contains Prometheus recording rules for Symon metrics, including:
+- Recent process CPU usage metrics
+- Recent process memory usage metrics
+
+## Viewing Metrics in Prometheus
+
+1. Go to http://localhost:9090
+2. Click on "Graph"
+3. Try these example queries:
+
+```promql
+# CPU usage by core
+system_cpu_usage_percent
+
+# Memory usage
+system_memory_usage_bytes
+
+# Network RX/TX
+system_network_rx_bytes_rate
+system_network_tx_bytes_rate
+
+# Disk usage
+system_disk_usage_bytes
+
+# Top processes by CPU
+topk(10, system_process_cpu_usage_percent)
+
+# Top processes by memory
+topk(10, system_process_memory_usage_bytes)
+```
+
+## Grafana Configuration
+
+Grafana is automatically configured with:
+- **Prometheus data source** (http://prometheus:9090) - pre-configured
+- **Symon System Overview dashboard** - pre-loaded
+
+To access:
+1. Go to http://localhost:3000 (username: `admin`, password: `admin`)
+2. Navigate to Dashboards → Browse → "Symon System Overview"
+
+The dashboard includes:
+- CPU usage by core
+- Memory usage (RAM/Swap)
+- Network traffic
+- Disk usage
+- Top 10 processes by CPU
+- Top 10 processes by Memory
+
+## Stopping the Stack
+
+```bash
+docker-compose down
+```
+
+To also remove volumes:
+
+```bash
+docker-compose down -v
+```
+
+## Troubleshooting
+
+### Symon not sending metrics
+
+Check the OTEL Collector logs:
+```bash
+docker-compose logs -f otel-collector
+```
+
+You should see messages about receiving metrics.
+
+### Prometheus not scraping
+
+1. Check Prometheus targets at http://localhost:9090/targets
+2. The `otel-collector` target should be UP
+
+### No data in Grafana
+
+1. Verify Prometheus data source is configured correctly
+2. Check that Prometheus has data by querying directly
+3. Ensure your time range in Grafana includes when Symon was running
+
+## Advanced Configuration
+
+### Using with TimescaleDB (optional)
+
+A TimescaleDB configuration file is available as `docker-compose-timescale.yml.ko` for long-term storage of metrics. Rename it to include it in your stack.
+
+### Custom Prometheus Rules
+
+Edit `rules/symon_rules.yml` to add custom recording or alerting rules.
+
+### OTEL Collector Sampling
+
+Edit `otel-collector-config.yml` to adjust the batch processor settings for different performance characteristics.
--- a/docker-compose-alloy/alloy-config.alloy
+++ b/docker-compose-alloy/alloy-config.alloy
@@ -0,0 +1,169 @@
+// Grafana Alloy Configuration
+// Sostituisce: Promtail + Node Exporter + OTEL Collector
+
+// ============================================================================
+// LOGGING - Raccolta log dai container Docker (sostituisce Promtail)
+// ============================================================================
+
+// Discover Docker containers that opted in to log collection.
+// The filter value must match the label set on the services in
+// docker-compose.yml (`logging: "alloy"`); with a different value the
+// discovery returns no targets and no container logs are collected.
+discovery.docker "containers" {
+  host = "unix:///var/run/docker.sock"
+
+  filter {
+    name   = "label"
+    values = ["logging=alloy"]
+  }
+
+  refresh_interval = "5s"
+}
+
+// Relabel rules applied to the discovered containers. The Docker meta labels
+// (`__meta_docker_*`) are attached to discovery targets, so they are mapped to
+// regular labels here and handed to loki.source.docker via `relabel_rules`,
+// which is the pattern documented for that component.
+discovery.relabel "docker" {
+  // Only the exported `rules` are used; no targets are relabeled here.
+  targets = []
+
+  // "/mycontainer" -> container="mycontainer"
+  rule {
+    source_labels = ["__meta_docker_container_name"]
+    regex         = "/(.*)"
+    target_label  = "container"
+  }
+
+  // Per-container job name taken from the `logging_jobname` container label.
+  rule {
+    source_labels = ["__meta_docker_container_label_logging_jobname"]
+    target_label  = "job"
+  }
+}
+
+// Read logs from the discovered containers and forward them to Loki.
+loki.source.docker "containers" {
+  host          = "unix:///var/run/docker.sock"
+  targets       = discovery.docker.containers.targets
+  relabel_rules = discovery.relabel.docker.rules
+  forward_to    = [loki.write.default.receiver]
+}
+
+// Push logs to Loki over its HTTP push API.
+loki.write "default" {
+  endpoint {
+    url = "http://loki:3100/loki/api/v1/push"
+  }
+
+  // Static labels configured for this writer.
+  // NOTE(review): values are hard-coded ("production", "myapp-cluster") —
+  // confirm they are appropriate for a local docker-compose stack.
+  external_labels = {
+    environment = "production",
+    cluster     = "myapp-cluster",
+  }
+}
+
+// ============================================================================
+// METRICS - Metriche sistema host (sostituisce Node Exporter)
+// ============================================================================
+
+// Host-level metrics from the embedded unix (node_exporter) integration.
+// Alloy runs inside a container, so the collectors are pointed at the host
+// paths that docker-compose.yml bind-mounts read-only:
+//   /proc -> /host/proc, /sys -> /host/sys, / -> /rootfs
+// Without these settings the mounts are unused and the exporter reads the
+// container's own /proc, /sys and root filesystem.
+prometheus.exporter.unix "host" {
+  procfs_path = "/host/proc"
+  sysfs_path  = "/host/sys"
+  rootfs_path = "/rootfs"
+
+  // Operating-system collectors to enable
+  set_collectors = [
+    "cpu",
+    "loadavg",
+    "meminfo",
+    "netdev",
+    "diskstats",
+    "filesystem",
+    "uname",
+    "time",
+  ]
+}
+
+// Scrape the targets exported by the unix exporter every 10s and hand the
+// samples to the remote_write component.
+prometheus.scrape "host_metrics" {
+  targets    = prometheus.exporter.unix.host.targets
+  forward_to = [prometheus.remote_write.default.receiver]
+
+  scrape_interval = "10s"
+
+  clustering {
+    enabled = false
+  }
+}
+
+// Alloy self-monitoring: scrape Alloy's own HTTP listener (port 12345).
+// NOTE(review): prometheus.yml also defines an `alloy` scrape job against
+// alloy:12345, so these metrics presumably arrive in Prometheus twice —
+// confirm which path is intended.
+prometheus.scrape "alloy_metrics" {
+  targets = [{
+    __address__ = "localhost:12345",
+  }]
+  forward_to = [prometheus.remote_write.default.receiver]
+
+  scrape_interval = "10s"
+}
+
+// ============================================================================
+// OTLP - Receive OTLP telemetry (replaces the OTEL Collector)
+// ============================================================================
+
+// Accept OTLP over gRPC (4317) and HTTP (4318). Metrics, logs and traces are
+// all routed to the batch processor.
+otelcol.receiver.otlp "default" {
+  grpc {
+    endpoint = "0.0.0.0:4317"
+  }
+
+  http {
+    endpoint = "0.0.0.0:4318"
+  }
+
+  output {
+    metrics = [otelcol.processor.batch.default.input]
+    logs    = [otelcol.processor.batch.default.input]
+    traces  = [otelcol.processor.batch.default.input]
+  }
+}
+
+// Batch telemetry before export, then fan out by signal type:
+// metrics -> Prometheus exporter, traces -> Tempo, logs -> Loki exporter.
+otelcol.processor.batch "default" {
+  timeout              = "10s"
+  send_batch_size      = 10000
+  send_batch_max_size  = 11000
+
+  output {
+    metrics = [otelcol.exporter.prometheus.default.input]
+    traces  = [otelcol.exporter.otlp.tempo.input]
+    logs    = [otelcol.exporter.loki.default.input]
+  }
+}
+
+// Convert OTLP metrics and forward them to the shared remote_write component
+otelcol.exporter.prometheus "default" {
+  forward_to = [prometheus.remote_write.default.receiver]
+}
+
+// Export traces to Tempo over OTLP gRPC (plaintext inside the compose network)
+otelcol.exporter.otlp "tempo" {
+  client {
+    endpoint = "tempo:4317"
+    tls {
+      insecure = true
+    }
+  }
+}
+
+// Convert OTLP logs and forward them to the shared Loki writer
+otelcol.exporter.loki "default" {
+  forward_to = [loki.write.default.receiver]
+}
+
+// ============================================================================
+// REMOTE WRITE - Send metrics to Prometheus
+// ============================================================================
+
+// Single sink for every metrics pipeline in this file (host metrics, Alloy
+// self-monitoring, OTLP metrics). The target is Prometheus' remote-write
+// endpoint, which docker-compose.yml enables with
+// `--web.enable-remote-write-receiver`.
+prometheus.remote_write "default" {
+  endpoint {
+    url = "http://prometheus:9090/api/v1/write"
+
+    metadata_config {
+      send_interval = "1m"
+    }
+
+    // Send-queue tuning for this endpoint
+    queue_config {
+      capacity          = 10000
+      max_shards        = 10
+      min_shards        = 1
+      max_samples_per_send = 5000
+    }
+  }
+}
--- a/docker-compose-alloy/docker-compose.yml
+++ b/docker-compose-alloy/docker-compose.yml
@@ -0,0 +1,120 @@
+# Alloy-based observability stack: Alloy (collection), Prometheus (metrics),
+# Grafana (dashboards), Loki (logs), Tempo (traces), Pyroscope (profiles).
+# NOTE(review): every image uses the `latest` tag, so the versions actually
+# run are not reproducible — consider pinning.
+services:
+
+  # Collector: runs the pipeline defined in alloy-config.alloy
+  alloy:
+    image: grafana/alloy:latest
+    container_name: alloy
+    command:
+      - run
+      - /etc/alloy/config.alloy
+      - --server.http.listen-addr=0.0.0.0:12345
+      - --storage.path=/var/lib/alloy/data
+    volumes:
+      # Pipeline configuration (mounted read-only)
+      - ./alloy-config.alloy:/etc/alloy/config.alloy:ro
+      # Docker socket, referenced by discovery.docker / loki.source.docker
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+      # Host filesystems, mounted read-only
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    ports:
+      - "4317:4317"   # OTLP gRPC
+      - "4318:4318"   # OTLP HTTP
+      - "12345:12345" # Alloy UI and metrics
+    networks:
+      - observ-net
+    restart: unless-stopped
+    # NOTE(review): presumably needed for host metrics / Docker socket access;
+    # confirm whether full privileged mode is actually required.
+    privileged: true
+    # Label that alloy-config.alloy's Docker discovery filters on
+    labels:
+      logging: "alloy"
+
+  # Metrics storage; also receives remote-write traffic
+  prometheus:
+    image: prom/prometheus:latest
+    container_name: prometheus
+    volumes:
+      - ./prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./rules:/etc/prometheus/rules
+    ports:
+      - "9090:9090" # Prometheus web UI
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      # Accept pushes on /api/v1/write (used by Alloy and Tempo configs)
+      - '--web.enable-remote-write-receiver'
+      - '--enable-feature=exemplar-storage'
+    # NOTE(review): no volume is mounted for Prometheus' data directory, so
+    # metrics presumably do not survive container re-creation — confirm.
+    depends_on:
+      - alloy
+    networks:
+      - observ-net
+    labels:
+      logging: "alloy"
+
+  # Dashboards; data sources are provisioned from ./grafana/provisioning
+  grafana:
+    image: grafana/grafana:latest
+    container_name: grafana
+    ports:
+      - "3000:3000"
+    environment:
+      # Default admin/admin credentials — intended for local use only
+      - GF_SECURITY_ADMIN_PASSWORD=admin
+      - GF_SECURITY_ADMIN_USER=admin
+    volumes:
+      - grafana-storage:/var/lib/grafana
+      - ./grafana/provisioning:/etc/grafana/provisioning
+    depends_on:
+      - prometheus
+    networks:
+      - observ-net
+    labels:
+      logging: "alloy"
+
+  # Log storage; data is kept in the ./loki-data bind mount
+  loki:
+    image: grafana/loki:latest
+    container_name: loki
+    # Runs as root (uid 0)
+    user: "0"
+    ports:
+      - "3100:3100"
+    volumes:
+      - ./loki-config.yml:/etc/loki/local-config.yaml
+      - ./loki-data:/loki
+    command: -config.file=/etc/loki/local-config.yaml
+    restart: unless-stopped
+    networks:
+      - observ-net
+    labels:
+      logging: "alloy"
+
+  # Trace storage; the OTLP ports (4317/4318) are not published to the host,
+  # only the HTTP query port is
+  tempo:
+    image: grafana/tempo:latest
+    container_name: tempo
+    # Runs as root (uid 0)
+    user: "0"
+    command: ["-config.file=/etc/tempo.yml"]
+    volumes:
+      - ./tempo-config.yml:/etc/tempo.yml
+      - tempo-data:/tmp/tempo
+    ports:
+      - "3200:3200"   # Tempo HTTP
+    restart: unless-stopped
+    networks:
+      - observ-net
+    labels:
+      logging: "alloy"
+
+  # Continuous profiling backend
+  pyroscope:
+    image: grafana/pyroscope:latest
+    container_name: pyroscope
+    ports:
+      - "4040:4040"   # Pyroscope UI and API
+    volumes:
+      - pyroscope-data:/var/lib/pyroscope
+    restart: unless-stopped
+    networks:
+      - observ-net
+    labels:
+      logging: "alloy"
+    environment:
+      - PYROSCOPE_LOG_LEVEL=info
+
+
+# Named volumes for Grafana, Tempo and Pyroscope state
+volumes:
+  grafana-storage:
+  tempo-data:
+  pyroscope-data:
+
+# Shared bridge network; services reach each other by service name
+networks:
+  observ-net:
+    driver: bridge
--- a/docker-compose-alloy/grafana/provisioning/datasources/observability_stack.yml
+++ b/docker-compose-alloy/grafana/provisioning/datasources/observability_stack.yml
@@ -0,0 +1,74 @@
+apiVersion: 1
+
+# Provisioned Grafana data sources.
+# Every data source declares an explicit `uid` because the cross-links in this
+# file (derivedFields, tracesToLogs, tracesToMetrics, tracesToProfiles,
+# serviceMap, lokiSearch) refer to other data sources by `datasourceUid`.
+# Without a fixed uid Grafana assigns a generated one and those references do
+# not resolve.
+datasources:
+  - name: Prometheus
+    type: prometheus
+    uid: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: true
+    jsonData:
+      timeInterval: 10s
+      queryTimeout: 60s
+
+  - name: Loki
+    type: loki
+    uid: loki
+    access: proxy
+    url: http://loki:3100
+    isDefault: false
+    editable: true
+    jsonData:
+      maxLines: 1000
+      # Turn trace IDs found in log lines into links to the Tempo data source
+      derivedFields:
+        - datasourceUid: tempo
+          matcherRegex: "trace_?id[\":]\\s*\"?([0-9a-fA-F]+)"
+          name: TraceID
+          # `$$` escapes `$` so provisioning does not treat it as an env var
+          url: "$${__value.raw}"
+
+  - name: Tempo
+    type: tempo
+    uid: tempo
+    access: proxy
+    url: http://tempo:3200
+    isDefault: false
+    editable: true
+    jsonData:
+      httpMethod: GET
+      # Trace -> logs (Loki)
+      tracesToLogs:
+        datasourceUid: loki
+        mapTagNamesEnabled: true
+        mappedTags:
+          - key: service.name
+            value: service_name
+        spanStartTimeShift: '-1h'
+        spanEndTimeShift: '1h'
+        filterByTraceID: true
+        filterBySpanID: false
+      # Trace -> metrics (Prometheus)
+      tracesToMetrics:
+        datasourceUid: prometheus
+        spanStartTimeShift: '-1h'
+        spanEndTimeShift: '1h'
+      # Trace -> profiles (Pyroscope)
+      tracesToProfiles:
+        datasourceUid: pyroscope
+        tags:
+          - key: service.name
+            value: service_name
+      # Service graph backed by Prometheus
+      serviceMap:
+        datasourceUid: prometheus
+      nodeGraph:
+        enabled: true
+      search:
+        hide: false
+      lokiSearch:
+        datasourceUid: loki
+
+  - name: Pyroscope
+    type: grafana-pyroscope-datasource
+    uid: pyroscope
+    access: proxy
+    url: http://pyroscope:4040
+    isDefault: false
+    editable: true
+    jsonData:
+      keepCookies: []
+      minStep: '15s'
--- a/docker-compose-alloy/loki-config.yml
+++ b/docker-compose-alloy/loki-config.yml
@@ -0,0 +1,43 @@
+# Loki configuration: filesystem storage under /loki, in-memory ring,
+# authentication disabled.
+auth_enabled: false
+
+server:
+  http_listen_port: 3100
+  grpc_listen_port: 9096
+
+# Settings shared by all Loki components
+common:
+  instance_addr: 127.0.0.1
+  path_prefix: /loki
+  storage:
+    filesystem:
+      chunks_directory: /loki/chunks
+      rules_directory: /loki/rules
+  replication_factor: 1
+  ring:
+    kvstore:
+      store: inmemory
+
+# Embedded query results cache (100 MB)
+query_range:
+  results_cache:
+    cache:
+      embedded_cache:
+        enabled: true
+        max_size_mb: 100
+
+# TSDB index with schema v13, one index table per 24h
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: tsdb
+      object_store: filesystem
+      schema: v13
+      index:
+        prefix: index_
+        period: 24h
+
+# NOTE(review): no Alertmanager service is defined in docker-compose.yml, so
+# this URL presumably points at nothing — confirm it is a placeholder.
+ruler:
+  alertmanager_url: http://localhost:9093
+
+limits_config:
+  # 720h = 30 days.
+  # NOTE(review): no `compactor` block is configured in this file; retention is
+  # presumably only enforced when the compactor runs with retention enabled —
+  # confirm against the Loki retention documentation.
+  retention_period: 720h
+  ingestion_rate_mb: 10
+  ingestion_burst_size_mb: 20
+  volume_enabled: true
--- a/docker-compose-alloy/prometheus.yml
+++ b/docker-compose-alloy/prometheus.yml
@@ -0,0 +1,22 @@
+# Prometheus configuration for the Alloy stack.
+global:
+  scrape_interval: 10s
+  evaluation_interval: 10s
+
+# Rule files; ./rules is mounted at this path by docker-compose.yml
+rule_files:
+  - /etc/prometheus/rules/*.yml
+
+scrape_configs:
+  # Job 1: Prometheus scrapes itself
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  # Job 2: scrape Alloy's own metrics (self-monitoring)
+  - job_name: 'alloy'
+    static_configs:
+      - targets: ['alloy:12345']
+        labels:
+          instance: 'alloy'
+
+# NOTE: host metrics (formerly Node Exporter) and OTLP metrics arrive via remote write from Alloy
+
--- a/docker-compose-alloy/rules/symon_rules.yml
+++ b/docker-compose-alloy/rules/symon_rules.yml
@@ -0,0 +1,15 @@
+# Recording rules for Symon process metrics, evaluated every 30s.
+# Each rule keeps only series whose latest sample is newer than 120 seconds:
+# `time() - timestamp(...) < 120` selects the fresh series and `and on(pid, name)`
+# restricts the original metric to the matching (pid, name) pairs.
+groups:
+  - name: symon_process_metrics
+    interval: 30s
+    rules:
+      # Per-process CPU usage, recent samples only
+      - record: system_process_cpu_usage_percent:recent
+        expr: |
+          system_process_cpu_usage_percent
+            and on(pid, name)
+            (time() - timestamp(system_process_cpu_usage_percent) < 120)
+      
+      # Per-process memory usage, recent samples only
+      - record: system_process_memory_usage_bytes:recent
+        expr: |
+          system_process_memory_usage_bytes
+            and on(pid, name)
+            (time() - timestamp(system_process_memory_usage_bytes) < 120)
--- a/docker-compose-alloy/tempo-config.yml
+++ b/docker-compose-alloy/tempo-config.yml
@@ -0,0 +1,49 @@
+# Tempo configuration: local-disk trace storage under /tmp/tempo.
+server:
+  http_listen_port: 3200
+
+# OTLP ingest endpoints; alloy-config.alloy exports traces to tempo:4317
+distributor:
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:4317
+        http:
+          endpoint: 0.0.0.0:4318
+
+ingester:
+  max_block_duration: 5m
+
+# Trace blocks are retained for 48h
+compactor:
+  compaction:
+    block_retention: 48h
+
+# /tmp/tempo is backed by the `tempo-data` volume in docker-compose.yml
+storage:
+  trace:
+    backend: local
+    wal:
+      path: /tmp/tempo/wal
+    local:
+      path: /tmp/tempo/blocks
+
+query_frontend:
+  search:
+    duration_slo: 5s
+    # 1.073741824e+09 bytes = 1 GiB
+    throughput_bytes_slo: 1.073741824e+09
+  trace_by_id:
+    duration_slo: 5s
+
+# Metrics derived from traces are pushed to Prometheus' remote-write endpoint
+# (enabled in docker-compose.yml via --web.enable-remote-write-receiver)
+metrics_generator:
+  registry:
+    external_labels:
+      source: tempo
+      cluster: docker-compose
+  storage:
+    path: /tmp/tempo/generator/wal
+    remote_write:
+      - url: http://prometheus:9090/api/v1/write
+        send_exemplars: true
+
+# Enable the service-graph and span-metrics processors by default
+overrides:
+  defaults:
+    metrics_generator:
+      processors: [service-graphs, span-metrics]
--- a/docker-compose-alloy/test-stack.sh
+++ b/docker-compose-alloy/test-stack.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+# Test script to verify the observability stack is running correctly
+
+set -e
+
+echo "🔍 Testing Symon OpenTelemetry Stack..."
+echo ""
+
+# Colors
+GREEN='\033[0;32m'
+RED='\033[0;31m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+# Test OTEL Collector gRPC endpoint
+echo -n "Testing OTEL Collector gRPC (port 4317)... "
+if nc -zv localhost 4317 2>&1 | grep -q "succeeded\|open"; then
+    echo -e "${GREEN}✓ OK${NC}"
+else
+    echo -e "${RED}✗ FAILED${NC}"
+    exit 1
+fi
+
+# Test OTEL Collector HTTP endpoint
+echo -n "Testing OTEL Collector HTTP (port 4318)... "
+if nc -zv localhost 4318 2>&1 | grep -q "succeeded\|open"; then
+    echo -e "${GREEN}✓ OK${NC}"
+else
+    echo -e "${RED}✗ FAILED${NC}"
+    exit 1
+fi
+
+# Test OTEL Collector metrics endpoint
+echo -n "Testing OTEL Collector metrics (port 8889)... "
+if curl -s http://localhost:8889/metrics > /dev/null; then
+    echo -e "${GREEN}✓ OK${NC}"
+else
+    echo -e "${RED}✗ FAILED${NC}"
+    exit 1
+fi
+
+# Test Prometheus
+echo -n "Testing Prometheus (port 9090)... "
+if curl -s http://localhost:9090/-/healthy | grep -q "Prometheus"; then
+    echo -e "${GREEN}✓ OK${NC}"
+else
+    echo -e "${RED}✗ FAILED${NC}"
+    exit 1
+fi
+
+# Test Prometheus targets
+echo -n "Testing Prometheus targets... "
+TARGETS=$(curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"up"' | wc -l)
+if [ "$TARGETS" -gt 0 ]; then
+    echo -e "${GREEN}✓ OK${NC} (${TARGETS} targets up)"
+else
+    echo -e "${YELLOW}⚠ WARNING${NC} (no targets up yet - this is normal if just started)"
+fi
+
+# Test Grafana
+echo -n "Testing Grafana (port 3000)... "
+if curl -s http://localhost:3000/api/health | grep -q "ok"; then
+    echo -e "${GREEN}✓ OK${NC}"
+else
+    echo -e "${RED}✗ FAILED${NC}"
+    exit 1
+fi
+
+echo ""
+echo -e "${GREEN}✓ All tests passed!${NC}"
+echo ""
+echo "📊 Access points:"
+echo "   - Prometheus: http://localhost:9090"
+echo "   - Grafana: http://localhost:3000 (admin/admin)"
+echo "   - OTEL Collector metrics: http://localhost:8889/metrics"
+echo ""
+echo "💡 Next steps:"
+echo "   1. Build Symon with: cargo build --release --features opentelemetry"
+echo "   2. Run in headless mode: ./target/release/btm --headless"
+echo "   3. Check metrics in Prometheus: http://localhost:9090/graph"