diff --git a/docker-compose-alloy/METRICS.md b/docker-compose-alloy/METRICS.md new file mode 100644 index 0000000..8db82b0 --- /dev/null +++ b/docker-compose-alloy/METRICS.md @@ -0,0 +1,227 @@ +# Symon OpenTelemetry Metrics Reference + +This document lists all metrics exported by Symon when running with the `opentelemetry` feature enabled. + +## System Metrics + +### CPU + +| Metric Name | Type | Labels | Description | +|------------|------|--------|-------------| +| `system_cpu_usage_percent` | Gauge | `cpu_id` | CPU usage percentage per core | + +**Example:** +```promql +# Average CPU across all cores +avg(system_cpu_usage_percent) + +# CPU usage for core 0 +system_cpu_usage_percent{cpu_id="0"} +``` + +### Memory + +| Metric Name | Type | Labels | Description | +|------------|------|--------|-------------| +| `system_memory_usage_bytes` | Gauge | - | RAM memory currently in use | +| `system_memory_total_bytes` | Gauge | - | Total RAM memory available | +| `system_swap_usage_bytes` | Gauge | - | Swap memory currently in use | +| `system_swap_total_bytes` | Gauge | - | Total swap memory available | + +**Example:** +```promql +# Memory usage percentage +(system_memory_usage_bytes / system_memory_total_bytes) * 100 + +# Available memory +system_memory_total_bytes - system_memory_usage_bytes +``` + +### Network + +| Metric Name | Type | Labels | Description | +|------------|------|--------|-------------| +| `system_network_rx_bytes_rate` | Gauge | `interface` | Network receive rate in bytes/sec | +| `system_network_tx_bytes_rate` | Gauge | `interface` | Network transmit rate in bytes/sec | + +**Example:** +```promql +# Total network throughput +sum(system_network_rx_bytes_rate) + sum(system_network_tx_bytes_rate) + +# RX rate for specific interface +system_network_rx_bytes_rate{interface="eth0"} +``` + +### Disk + +| Metric Name | Type | Labels | Description | +|------------|------|--------|-------------| +| `system_disk_usage_bytes` | Gauge | `device`, `mount` | Disk space currently in use | +| `system_disk_total_bytes` | Gauge | `device`, `mount` | Total disk space available | + +**Example:** +```promql +# Disk usage percentage +(system_disk_usage_bytes / system_disk_total_bytes) * 100 + +# Free disk space +system_disk_total_bytes - system_disk_usage_bytes +``` + +### Temperature + +| Metric Name | Type | Labels | Description | +|------------|------|--------|-------------| +| `system_temperature_celsius` | Gauge | `sensor` | Temperature readings in Celsius | + +**Example:** +```promql +# Average temperature across all sensors +avg(system_temperature_celsius) + +# Maximum temperature +max(system_temperature_celsius) +``` + +## Process Metrics + +| Metric Name | Type | Labels | Description | +|------------|------|--------|-------------| +| `system_process_cpu_usage_percent` | Gauge | `name`, `pid` | CPU usage percentage per process | +| `system_process_memory_usage_bytes` | Gauge | `name`, `pid` | Memory usage in bytes per process | +| `system_process_count` | Gauge | - | Total number of processes | + +**Example:** +```promql +# Top 10 processes by CPU +topk(10, system_process_cpu_usage_percent) + +# Top 10 processes by memory +topk(10, system_process_memory_usage_bytes) + +# Total memory used by all Chrome processes +sum(system_process_memory_usage_bytes{name=~".*chrome.*"}) +``` + +## Recording Rules + +The following recording rules are pre-configured in Prometheus (see `rules/Symon_rules.yml`): + +| Rule Name | Expression | Description | +|-----------|------------|-------------| +| 
`system_process_cpu_usage_percent:recent` | `system_process_cpu_usage_percent` filtered by sample age | Filters out stale process data (>2 min old) |
| `system_process_memory_usage_bytes:recent` | `system_process_memory_usage_bytes` filtered by sample age | Filters out stale process data (>2 min old) |

**Example:**
```promql
# Query only recent process data
topk(10, system_process_cpu_usage_percent:recent)
```

## Common Queries

### System Health

```promql
# Overall system CPU usage
avg(system_cpu_usage_percent)

# Memory pressure (>80% is high)
(system_memory_usage_bytes / system_memory_total_bytes) * 100

# Disk pressure (>90% is critical)
(system_disk_usage_bytes / system_disk_total_bytes) * 100
```

### Resource Hogs

```promql
# Top CPU consumers
topk(5, system_process_cpu_usage_percent)

# Top memory consumers
topk(5, system_process_memory_usage_bytes)

# Processes using >1GB memory
system_process_memory_usage_bytes > 1073741824
```

### Network Analysis

```promql
# Total network traffic (RX + TX)
sum(system_network_rx_bytes_rate) + sum(system_network_tx_bytes_rate)

# Network traffic by interface
sum by (interface) (system_network_rx_bytes_rate + system_network_tx_bytes_rate)

# Interfaces with high RX rate (>10MB/s)
system_network_rx_bytes_rate > 10485760
```

## Alerting Examples

### Sample Prometheus Alert Rules

```yaml
groups:
  - name: Symon_alerts
    interval: 30s
    rules:
      - alert: HighCPUUsage
        expr: avg(system_cpu_usage_percent) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "Average CPU usage is {{ $value }}%"

      - alert: HighMemoryUsage
        expr: (system_memory_usage_bytes / system_memory_total_bytes) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is {{ $value }}%"

      - alert: DiskAlmostFull
        expr: (system_disk_usage_bytes / system_disk_total_bytes) * 100 > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Disk {{ $labels.mount }} almost full"
          description: "Disk usage is {{ $value }}% on {{ $labels.mount }}"
```

## Label Reference

| Label | Used In | Description |
|-------|---------|-------------|
| `cpu_id` | CPU metrics | CPU core identifier (0, 1, 2, ...) |
| `interface` | Network metrics | Network interface name (eth0, wlan0, ...) |
| `device` | Disk metrics | Device name (/dev/sda1, ...) |
| `mount` | Disk metrics | Mount point (/, /home, ...) |
| `sensor` | Temperature | Temperature sensor name |
| `name` | Process metrics | Process name |
| `pid` | Process metrics | Process ID |
| `exported_job` | All | Always "Symon-system-monitor" |
| `otel_scope_name` | All | Always "Symon-system-monitor" |

## Data Retention

By default, Prometheus stores metrics for 15 days. Retention is controlled by a command-line flag rather than by `prometheus.yml`, so adjust it where Prometheus is started:

```yaml
# In docker-compose.yml, under the prometheus service
command:
  - '--config.file=/etc/prometheus/prometheus.yml'
  - '--storage.tsdb.retention.time=30d'  # keep data for 30 days
```

For long-term storage, consider using:
- **TimescaleDB** (see `docker-compose-timescale.yml.ko`)
- **Thanos** for multi-cluster metrics
- **Cortex** for horizontally scalable storage

diff --git a/docker-compose-alloy/README-ALLOY.md b/docker-compose-alloy/README-ALLOY.md
new file mode 100644
index 0000000..708f97a
--- /dev/null
+++ b/docker-compose-alloy/README-ALLOY.md
@@ -0,0 +1,148 @@
# Observability Stack with Grafana Alloy

This directory contains a **simplified** observability stack that uses **Grafana Alloy** in place of three separate components.
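Because Alloy listens on the same OTLP ports the collector used (4317 gRPC, 4318 HTTP), an application that was already exporting to the original stack should not need any changes; as a quick sketch, the same headless command shown in the main README keeps working:

```bash
# Point any OTLP client at Alloy instead of the OTEL Collector
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 \
./target/release/btm --headless
```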
## Differences from the original docker-compose

### Original setup (7 services)
```
- Promtail       → Collects logs from the containers
- Node Exporter  → Host system metrics
- OTEL Collector → Receives OTLP telemetry
- Prometheus     → Metrics storage
- Loki           → Log storage
- Tempo          → Trace storage
- Grafana        → Visualization
```

### Setup with Alloy (5 services) ⭐
```
- Alloy      → Log collection + host metrics + OTLP (all in one!)
- Prometheus → Metrics storage
- Loki       → Log storage
- Tempo      → Trace storage
- Grafana    → Visualization
+ Pyroscope  → Continuous profiling
```

## What Alloy does

Grafana Alloy replaces **3 services** with a single one:

| Function | Before | After |
|----------|--------|-------|
| **Docker log collection** | Promtail | Alloy |
| **Host system metrics** | Node Exporter | Alloy (built-in unix exporter) |
| **OTLP ingestion** | OTEL Collector | Alloy (otelcol receiver) |

### Advantages of Alloy:
- ✅ **Fewer containers** to manage (5 instead of 7)
- ✅ **Unified configuration** in a single file (alloy-config.alloy)
- ✅ **Lower resource usage** (CPU/RAM)
- ✅ **Built-in UI** for debugging (http://localhost:12345)
- ✅ **More modern** (River language instead of YAML)
- ✅ **Hot reload** of the configuration

### How it works:

1. **Logs**: Alloy discovers the containers labelled `logging=alloy` and ships their logs to Loki
2. **Metrics**: Alloy collects host metrics and sends them to Prometheus via remote write
3. **Traces**: Alloy receives OTLP traces and forwards them to Tempo

## Exposed ports

| Service | Port | Description |
|---------|------|-------------|
| Alloy | 12345 | UI and self-monitoring metrics |
| Alloy | 4317 | OTLP gRPC (traces/metrics) |
| Alloy | 4318 | OTLP HTTP (traces/metrics) |
| Prometheus | 9090 | Query and UI |
| Loki | 3100 | Push/Query API |
| Tempo | 3200 | Query API |
| Pyroscope | 4040 | Profiling UI and API |
| Grafana | 3000 | Dashboards |

## How to use it

### Start the stack:
```bash
cd docker-compose-alloy
docker-compose up -d
```

### Check Alloy:
```bash
# Alloy UI (very handy for debugging!)
open http://localhost:12345

# List the discovered targets
curl http://localhost:12345/api/v0/component/discovery.docker.containers/targets

# Inspect the loaded components
curl http://localhost:12345/api/v0/web/components
```

### Access the services:
- **Grafana**: http://localhost:3000 (admin/admin)
- **Alloy UI**: http://localhost:12345
- **Prometheus**: http://localhost:9090
- **Pyroscope**: http://localhost:4040

## Configuration

### Main files:
- **alloy-config.alloy**: Unified Alloy configuration (replaces promtail-config.yml and otel-collector-config.yml)
- **prometheus.yml**: Prometheus configuration (simplified; Alloy pushes metrics via remote write)
- **loki-config.yml**: Loki configuration (unchanged)
- **tempo-config.yml**: Tempo configuration (unchanged)

### Changing the Alloy configuration:

1. Edit `alloy-config.alloy` (an example tweak is sketched after this list)
2. Restart the container: `docker-compose restart alloy`
3. Verify the configuration in the Alloy UI: http://localhost:12345
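As an illustration of step 1, the snippet below changes the static labels attached to every log line by editing the `loki.write` block that already exists in `alloy-config.alloy`; the `environment` value shown here is a made-up example, not a recommended setting:

```river
// alloy-config.alloy — illustrative tweak, values are examples only
loki.write "default" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }

  external_labels = {
    environment = "staging",       // hypothetical value for this sketch
    cluster     = "myapp-cluster",
  }
}
```

After saving, `docker-compose restart alloy` (step 2) picks up the change.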
## Available metrics

Alloy exposes the same metrics as Node Exporter, using the `node_*` prefix:
```promql
# CPU usage
rate(node_cpu_seconds_total[5m])

# Memory
node_memory_MemAvailable_bytes

# Disk
node_filesystem_avail_bytes
```

## Troubleshooting

### Check that Alloy is collecting logs:
```bash
docker-compose logs alloy | grep loki
```

### Check that Alloy is sending metrics to Prometheus:
```bash
curl http://localhost:9090/api/v1/label/__name__/values | grep node_
```

### List the active Alloy components:
```bash
curl http://localhost:12345/api/v0/web/components | jq
```

## Migrating from the original version

If you are already running the Promtail/Node-Exporter/OTEL version:

1. Stop the old stack: `cd ../docker-compose && docker-compose down`
2. Start the new one: `cd ../docker-compose-alloy && docker-compose up -d`
3. Historical data in Prometheus/Loki/Tempo is preserved in the Docker volumes

## Useful links

- [Grafana Alloy Documentation](https://grafana.com/docs/alloy/latest/)
- [Alloy Configuration Reference](https://grafana.com/docs/alloy/latest/reference/)
- [River Language](https://grafana.com/docs/alloy/latest/concepts/configuration-syntax/)

diff --git a/docker-compose-alloy/README.md b/docker-compose-alloy/README.md
new file mode 100644
index 0000000..045c01c
--- /dev/null
+++ b/docker-compose-alloy/README.md
@@ -0,0 +1,195 @@
# Symon OpenTelemetry Docker Compose Setup

This directory contains a Docker Compose setup for running an observability stack to monitor Symon with OpenTelemetry.

## Architecture

The stack includes:

1. **OpenTelemetry Collector** - Receives metrics from Symon via the OTLP protocol
2. **Prometheus** - Scrapes and stores metrics from the OTEL Collector
3. **Grafana** - Visualizes metrics from Prometheus

```
Symon (with --headless flag)
    ↓ (OTLP/gRPC on port 4317)
OpenTelemetry Collector
    ↓ (Prometheus scrape on port 8889)
Prometheus
    ↓ (Query on port 9090)
Grafana (accessible on port 3000)
```

## Quick Start

### 1. Start the observability stack

```bash
cd docker-compose
docker-compose up -d
```

This will start:
- OpenTelemetry Collector on ports 4317 (gRPC), 4318 (HTTP), 8889 (metrics)
- Prometheus on port 9090
- Grafana on port 3000

### 2. Build Symon with OpenTelemetry support

```bash
cd ..
cargo build --release --features opentelemetry
```

### 3. Create a configuration file

Create a `Symon-config.toml` file:

```toml
[opentelemetry]
enabled = true
endpoint = "http://localhost:4317"
service_name = "Symon-system-monitor"
export_interval_ms = 5000

[opentelemetry.metrics]
cpu = true
memory = true
network = true
disk = true
processes = true
temperature = true
gpu = true
```

### 4. Run Symon in headless mode

```bash
./target/release/btm --config Symon-config.toml --headless
```

Or without a config file:

```bash
OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 \
./target/release/btm --headless
```
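Before moving on, you can optionally confirm that metrics are reaching Prometheus by calling its standard query API; the metric name below comes from `METRICS.md` in this directory, and it may take one or two export intervals before data appears:

```bash
# Should return a JSON body with a non-empty "data.result" once metrics arrive
curl -s 'http://localhost:9090/api/v1/query?query=avg(system_cpu_usage_percent)'
```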
### 5. Access the dashboards

- **Prometheus**: http://localhost:9090
- **Grafana**: http://localhost:3000 (username: `admin`, password: `admin`)

## Configuration Files

### otel-collector-config.yml

Configures the OpenTelemetry Collector to:
- Receive OTLP data on ports 4317 (gRPC) and 4318 (HTTP)
- Export metrics in Prometheus format on port 8889
- Debug log all received data

### prometheus.yml

Configures Prometheus to:
- Scrape metrics from the OTEL Collector every 10 seconds
- Load recording rules from `rules/Symon_rules.yml`

### rules/Symon_rules.yml

Contains Prometheus recording rules for Symon metrics, including:
- Recent process CPU usage metrics
- Recent process memory usage metrics

## Viewing Metrics in Prometheus

1. Go to http://localhost:9090
2. Click on "Graph"
3. Try these example queries:

```promql
# CPU usage by core
system_cpu_usage_percent

# Memory usage
system_memory_usage_bytes

# Network RX/TX rates
system_network_rx_bytes_rate
system_network_tx_bytes_rate

# Disk usage
system_disk_usage_bytes

# Top processes by CPU
topk(10, system_process_cpu_usage_percent)

# Top processes by memory
topk(10, system_process_memory_usage_bytes)
```

## Grafana Configuration

Grafana is automatically configured with:
- **Prometheus data source** (http://prometheus:9090) - pre-configured
- **Symon System Overview dashboard** - pre-loaded

To access:
1. Go to http://localhost:3000 (username: `admin`, password: `admin`)
2. Navigate to Dashboards → Browse → "Symon System Overview"

The dashboard includes:
- CPU usage by core
- Memory usage (RAM/Swap)
- Network traffic
- Disk usage
- Top 10 processes by CPU
- Top 10 processes by memory

## Stopping the Stack

```bash
docker-compose down
```

To also remove volumes:

```bash
docker-compose down -v
```

## Troubleshooting

### Symon not sending metrics

Check the OTEL Collector logs:
```bash
docker-compose logs -f otel-collector
```

You should see messages about receiving metrics.

### Prometheus not scraping

1. Check Prometheus targets at http://localhost:9090/targets
2. The `otel-collector` target should be UP

### No data in Grafana

1. Verify the Prometheus data source is configured correctly
2. Check that Prometheus has data by querying it directly
3. Ensure your time range in Grafana includes when Symon was running

## Advanced Configuration

### Using with TimescaleDB (optional)

A TimescaleDB configuration file is available as `docker-compose-timescale.yml.ko` for long-term storage of metrics. Rename it to include it in your stack.

### Custom Prometheus Rules

Edit `rules/Symon_rules.yml` to add custom recording or alerting rules.

### OTEL Collector Sampling

Edit `otel-collector-config.yml` to adjust the batch processor settings for different performance characteristics.
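For orientation, a batch processor section typically looks like the sketch below; the values match those used by `alloy-config.alloy` in this directory and are a starting point rather than tuned recommendations:

```yaml
# otel-collector-config.yml — illustrative batch processor settings
processors:
  batch:
    timeout: 10s
    send_batch_size: 10000
    send_batch_max_size: 11000
```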
diff --git a/docker-compose-alloy/alloy-config.alloy b/docker-compose-alloy/alloy-config.alloy
new file mode 100644
index 0000000..ea46c4e
--- /dev/null
+++ b/docker-compose-alloy/alloy-config.alloy
@@ -0,0 +1,169 @@
// Grafana Alloy Configuration
// Replaces: Promtail + Node Exporter + OTEL Collector

// ============================================================================
// LOGGING - Collects logs from the Docker containers (replaces Promtail)
// ============================================================================

// Discover Docker containers carrying the label logging=alloy
// (matches the labels set in docker-compose.yml)
discovery.docker "containers" {
  host = "unix:///var/run/docker.sock"

  filter {
    name   = "label"
    values = ["logging=alloy"]
  }

  refresh_interval = "5s"
}

// Tail logs from the discovered containers
loki.source.docker "containers" {
  host       = "unix:///var/run/docker.sock"
  targets    = discovery.docker.containers.targets
  forward_to = [loki.relabel.docker.receiver]
}

// Relabeling to attach labels to the logs
loki.relabel "docker" {
  forward_to = [loki.write.default.receiver]

  rule {
    source_labels = ["__meta_docker_container_name"]
    regex         = "/(.*)"
    target_label  = "container"
  }

  rule {
    source_labels = ["__meta_docker_container_label_logging_jobname"]
    target_label  = "job"
  }
}

// Ship the logs to Loki
loki.write "default" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }

  external_labels = {
    environment = "production",
    cluster = "myapp-cluster",
  }
}

// ============================================================================
// METRICS - Host system metrics (replaces Node Exporter)
// ============================================================================

// Collect metrics from the local host
prometheus.exporter.unix "host" {
  // Operating-system collectors to enable
  set_collectors = [
    "cpu",
    "loadavg",
    "meminfo",
    "netdev",
    "diskstats",
    "filesystem",
    "uname",
    "time",
  ]
}

// Scrape the collected metrics
prometheus.scrape "host_metrics" {
  targets    = prometheus.exporter.unix.host.targets
  forward_to = [prometheus.remote_write.default.receiver]

  scrape_interval = "10s"

  clustering {
    enabled = false
  }
}

// Scrape Alloy's own self-monitoring metrics
prometheus.scrape "alloy_metrics" {
  targets = [{
    __address__ = "localhost:12345",
  }]
  forward_to = [prometheus.remote_write.default.receiver]

  scrape_interval = "10s"
}

// ============================================================================
// TRACES - OTLP ingestion (replaces the OTEL Collector)
// ============================================================================

// Receive telemetry via OTLP (gRPC and HTTP)
otelcol.receiver.otlp "default" {
  grpc {
    endpoint = "0.0.0.0:4317"
  }

  http {
    endpoint = "0.0.0.0:4318"
  }

  output {
    metrics = [otelcol.processor.batch.default.input]
    logs    = [otelcol.processor.batch.default.input]
    traces  = [otelcol.processor.batch.default.input]
  }
}

// Batch processor to optimize exports
otelcol.processor.batch "default" {
  timeout             = "10s"
  send_batch_size     = 10000
  send_batch_max_size = 11000

  output {
    metrics = [otelcol.exporter.prometheus.default.input]
    traces  = [otelcol.exporter.otlp.tempo.input]
    logs    = [otelcol.exporter.loki.default.input]
  }
}

// Export OTLP metrics to Prometheus
otelcol.exporter.prometheus "default" {
  forward_to = [prometheus.remote_write.default.receiver]
}

// Export traces to Tempo
otelcol.exporter.otlp "tempo" {
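  // Sends batched spans to Tempo's OTLP gRPC receiver (tempo:4317 on the
  // internal compose network); TLS is disabled because traffic never leaves it.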
client { + endpoint = "tempo:4317" + tls { + insecure = true + } + } +} + +// Esporta log OTLP a Loki +otelcol.exporter.loki "default" { + forward_to = [loki.write.default.receiver] +} + +// ============================================================================ +// REMOTE WRITE - Invia metriche a Prometheus +// ============================================================================ + +prometheus.remote_write "default" { + endpoint { + url = "http://prometheus:9090/api/v1/write" + + metadata_config { + send_interval = "1m" + } + + queue_config { + capacity = 10000 + max_shards = 10 + min_shards = 1 + max_samples_per_send = 5000 + } + } +} diff --git a/docker-compose-alloy/docker-compose.yml b/docker-compose-alloy/docker-compose.yml new file mode 100644 index 0000000..6d8440e --- /dev/null +++ b/docker-compose-alloy/docker-compose.yml @@ -0,0 +1,120 @@ +services: + + alloy: + image: grafana/alloy:latest + container_name: alloy + command: + - run + - /etc/alloy/config.alloy + - --server.http.listen-addr=0.0.0.0:12345 + - --storage.path=/var/lib/alloy/data + volumes: + - ./alloy-config.alloy:/etc/alloy/config.alloy:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + ports: + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP HTTP + - "12345:12345" # Alloy UI and metrics + networks: + - observ-net + restart: unless-stopped + privileged: true + labels: + logging: "alloy" + + prometheus: + image: prom/prometheus:latest + container_name: prometheus + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + - ./rules:/etc/prometheus/rules + ports: + - "9090:9090" # Interfaccia Web di Prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--web.enable-remote-write-receiver' + - '--enable-feature=exemplar-storage' + depends_on: + - alloy + networks: + - observ-net + labels: + logging: "alloy" + + grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_SECURITY_ADMIN_USER=admin + volumes: + - grafana-storage:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning + depends_on: + - prometheus + networks: + - observ-net + labels: + logging: "alloy" + + loki: + image: grafana/loki:latest + container_name: loki + user: "0" + ports: + - "3100:3100" + volumes: + - ./loki-config.yml:/etc/loki/local-config.yaml + - ./loki-data:/loki + command: -config.file=/etc/loki/local-config.yaml + restart: unless-stopped + networks: + - observ-net + labels: + logging: "alloy" + + tempo: + image: grafana/tempo:latest + container_name: tempo + user: "0" + command: ["-config.file=/etc/tempo.yml"] + volumes: + - ./tempo-config.yml:/etc/tempo.yml + - tempo-data:/tmp/tempo + ports: + - "3200:3200" # Tempo HTTP + restart: unless-stopped + networks: + - observ-net + labels: + logging: "alloy" + + pyroscope: + image: grafana/pyroscope:latest + container_name: pyroscope + ports: + - "4040:4040" # Pyroscope UI and API + volumes: + - pyroscope-data:/var/lib/pyroscope + restart: unless-stopped + networks: + - observ-net + labels: + logging: "alloy" + environment: + - PYROSCOPE_LOG_LEVEL=info + + +volumes: + grafana-storage: + tempo-data: + pyroscope-data: + +networks: + observ-net: + driver: bridge \ No newline at end of file diff --git a/docker-compose-alloy/grafana/provisioning/datasources/observability_stack.yml b/docker-compose-alloy/grafana/provisioning/datasources/observability_stack.yml new file mode 100644 index 
0000000..d06256d --- /dev/null +++ b/docker-compose-alloy/grafana/provisioning/datasources/observability_stack.yml @@ -0,0 +1,74 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: true + jsonData: + timeInterval: 10s + queryTimeout: 60s + + - name: Loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: false + editable: true + jsonData: + maxLines: 1000 + derivedFields: + - datasourceUid: tempo + matcherRegex: "trace_?id[\":]\\s*\"?([0-9a-fA-F]+)" + name: TraceID + url: "$${__value.raw}" + + - name: Tempo + type: tempo + uid: tempo + access: proxy + url: http://tempo:3200 + isDefault: false + editable: true + jsonData: + httpMethod: GET + tracesToLogs: + datasourceUid: loki + mapTagNamesEnabled: true + mappedTags: + - key: service.name + value: service_name + spanStartTimeShift: '-1h' + spanEndTimeShift: '1h' + filterByTraceID: true + filterBySpanID: false + tracesToMetrics: + datasourceUid: prometheus + spanStartTimeShift: '-1h' + spanEndTimeShift: '1h' + tracesToProfiles: + datasourceUid: pyroscope + tags: + - key: service.name + value: service_name + serviceMap: + datasourceUid: prometheus + nodeGraph: + enabled: true + search: + hide: false + lokiSearch: + datasourceUid: loki + + - name: Pyroscope + type: grafana-pyroscope-datasource + uid: pyroscope + access: proxy + url: http://pyroscope:4040 + isDefault: false + editable: true + jsonData: + keepCookies: [] + minStep: '15s' diff --git a/docker-compose-alloy/loki-config.yml b/docker-compose-alloy/loki-config.yml new file mode 100644 index 0000000..319d1a7 --- /dev/null +++ b/docker-compose-alloy/loki-config.yml @@ -0,0 +1,43 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + +schema_config: + configs: + - from: 2020-10-24 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +ruler: + alertmanager_url: http://localhost:9093 + +limits_config: + retention_period: 720h + ingestion_rate_mb: 10 + ingestion_burst_size_mb: 20 + volume_enabled: true diff --git a/docker-compose-alloy/prometheus.yml b/docker-compose-alloy/prometheus.yml new file mode 100644 index 0000000..0236393 --- /dev/null +++ b/docker-compose-alloy/prometheus.yml @@ -0,0 +1,22 @@ +global: + scrape_interval: 10s + evaluation_interval: 10s + +rule_files: + - /etc/prometheus/rules/*.yml + +scrape_configs: + # Job 1: Monitora se Prometheus stesso è attivo + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # Job 2: Scrape metriche di Alloy (self-monitoring) + - job_name: 'alloy' + static_configs: + - targets: ['alloy:12345'] + labels: + instance: 'alloy' + +# NOTA: Le metriche dell'host (ex-Node Exporter) e OTLP arrivano via Remote Write da Alloy + diff --git a/docker-compose-alloy/rules/symon_rules.yml b/docker-compose-alloy/rules/symon_rules.yml new file mode 100644 index 0000000..16ceb96 --- /dev/null +++ b/docker-compose-alloy/rules/symon_rules.yml @@ -0,0 +1,15 @@ +groups: + - name: symon_process_metrics + interval: 30s + rules: + - record: system_process_cpu_usage_percent:recent + expr: | + system_process_cpu_usage_percent + and on(pid, name) + (time() - 
timestamp(system_process_cpu_usage_percent) < 120) + + - record: system_process_memory_usage_bytes:recent + expr: | + system_process_memory_usage_bytes + and on(pid, name) + (time() - timestamp(system_process_memory_usage_bytes) < 120) \ No newline at end of file diff --git a/docker-compose-alloy/tempo-config.yml b/docker-compose-alloy/tempo-config.yml new file mode 100644 index 0000000..a8ab2bf --- /dev/null +++ b/docker-compose-alloy/tempo-config.yml @@ -0,0 +1,49 @@ +server: + http_listen_port: 3200 + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +ingester: + max_block_duration: 5m + +compactor: + compaction: + block_retention: 48h + +storage: + trace: + backend: local + wal: + path: /tmp/tempo/wal + local: + path: /tmp/tempo/blocks + +query_frontend: + search: + duration_slo: 5s + throughput_bytes_slo: 1.073741824e+09 + trace_by_id: + duration_slo: 5s + +metrics_generator: + registry: + external_labels: + source: tempo + cluster: docker-compose + storage: + path: /tmp/tempo/generator/wal + remote_write: + - url: http://prometheus:9090/api/v1/write + send_exemplars: true + +overrides: + defaults: + metrics_generator: + processors: [service-graphs, span-metrics] diff --git a/docker-compose-alloy/test-stack.sh b/docker-compose-alloy/test-stack.sh new file mode 100755 index 0000000..d90d409 --- /dev/null +++ b/docker-compose-alloy/test-stack.sh @@ -0,0 +1,80 @@ +#!/bin/bash +# Test script to verify the observability stack is running correctly + +set -e + +echo "🔍 Testing Symon OpenTelemetry Stack..." +echo "" + +# Colors +GREEN='\033[0;32m' +RED='\033[0;31m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Test OTEL Collector gRPC endpoint +echo -n "Testing OTEL Collector gRPC (port 4317)... " +if nc -zv localhost 4317 2>&1 | grep -q "succeeded\|open"; then + echo -e "${GREEN}✓ OK${NC}" +else + echo -e "${RED}✗ FAILED${NC}" + exit 1 +fi + +# Test OTEL Collector HTTP endpoint +echo -n "Testing OTEL Collector HTTP (port 4318)... " +if nc -zv localhost 4318 2>&1 | grep -q "succeeded\|open"; then + echo -e "${GREEN}✓ OK${NC}" +else + echo -e "${RED}✗ FAILED${NC}" + exit 1 +fi + +# Test OTEL Collector metrics endpoint +echo -n "Testing OTEL Collector metrics (port 8889)... " +if curl -s http://localhost:8889/metrics > /dev/null; then + echo -e "${GREEN}✓ OK${NC}" +else + echo -e "${RED}✗ FAILED${NC}" + exit 1 +fi + +# Test Prometheus +echo -n "Testing Prometheus (port 9090)... " +if curl -s http://localhost:9090/-/healthy | grep -q "Prometheus"; then + echo -e "${GREEN}✓ OK${NC}" +else + echo -e "${RED}✗ FAILED${NC}" + exit 1 +fi + +# Test Prometheus targets +echo -n "Testing Prometheus targets... " +TARGETS=$(curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"up"' | wc -l) +if [ "$TARGETS" -gt 0 ]; then + echo -e "${GREEN}✓ OK${NC} (${TARGETS} targets up)" +else + echo -e "${YELLOW}⚠ WARNING${NC} (no targets up yet - this is normal if just started)" +fi + +# Test Grafana +echo -n "Testing Grafana (port 3000)... " +if curl -s http://localhost:3000/api/health | grep -q "ok"; then + echo -e "${GREEN}✓ OK${NC}" +else + echo -e "${RED}✗ FAILED${NC}" + exit 1 +fi + +echo "" +echo -e "${GREEN}✓ All tests passed!${NC}" +echo "" +echo "📊 Access points:" +echo " - Prometheus: http://localhost:9090" +echo " - Grafana: http://localhost:3000 (admin/admin)" +echo " - OTEL Collector metrics: http://localhost:8889/metrics" +echo "" +echo "💡 Next steps:" +echo " 1. 
Build Symon with: cargo build --release --features opentelemetry" +echo " 2. Run in headless mode: ./target/release/btm --headless" +echo " 3. Check metrics in Prometheus: http://localhost:9090/graph"
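
# Typical workflow (assumed; it mirrors the READMEs in this directory):
#   cd docker-compose-alloy && docker-compose up -d     # start the stack
#   ./test-stack.sh                                      # run these checks
#   cargo build --release --features opentelemetry       # build Symon from the repo root
#   OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 ./target/release/btm --headless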