diff --git a/docs/METRICS.md b/docs/METRICS.md new file mode 100644 index 0000000..7bf6ea8 --- /dev/null +++ b/docs/METRICS.md @@ -0,0 +1,232 @@ +# AgentHub Metrics & Monitoring + +AgentHub expose des métriques Prometheus via l'endpoint `/metrics` pour monitoring et observabilité. + +## 🎯 Endpoint Metrics + +**URL**: `GET /metrics` +**Format**: Prometheus text format (OpenMetrics) +**Auth**: Non requis (endpoint public pour scraping) + +### Exemple + +```bash +curl http://localhost:3000/metrics +``` + +## 📊 Métriques Exposées + +### Métriques AgentHub + +| Métrique | Type | Description | +|----------|------|-------------| +| `agenthub_agents_connected` | Gauge | Nombre d'agents connectés via WebSocket | +| `agenthub_rooms_active` | Gauge | Nombre de rooms actives (avec au moins 1 membre) | +| `agenthub_messages_total` | Counter | Total des messages envoyés | +| `agenthub_websocket_latency_seconds` | Histogram | Latence d'envoi de messages WebSocket (p50, p95, p99) | +| `agenthub_http_requests_total` | Counter | Total des requêtes HTTP par route et status | +| `agenthub_http_request_duration_seconds` | Histogram | Durée des requêtes HTTP par route | +| `agenthub_db_query_duration_seconds` | Histogram | Durée des requêtes DB par opération | + +### Métriques Node.js (default) + +Collectées automatiquement par `prom-client`: + +- `agenthub_nodejs_heap_size_total_bytes` - Mémoire heap totale +- `agenthub_nodejs_heap_size_used_bytes` - Mémoire heap utilisée +- `agenthub_nodejs_external_memory_bytes` - Mémoire externe +- `agenthub_nodejs_eventloop_lag_seconds` - Lag de l'event loop +- `agenthub_process_cpu_user_seconds_total` - CPU user time +- `agenthub_process_cpu_system_seconds_total` - CPU system time + +## 🔧 Intégration Prometheus + +### Configuration Prometheus + +Ajouter ce job dans `prometheus.yml`: + +```yaml +scrape_configs: + - job_name: 'agenthub' + scrape_interval: 15s + static_configs: + - targets: ['192.168.9.23:3000'] # Remplacer par votre IP/host +``` + 
+### Queries PromQL Utiles + +```promql +# Agents connectés en temps réel +agenthub_agents_connected + +# Rooms actives +agenthub_rooms_active + +# Messages par seconde (rate 5min) +rate(agenthub_messages_total[5m]) + +# Latence WebSocket p99 +histogram_quantile(0.99, sum by(le) (rate(agenthub_websocket_latency_seconds_bucket[5m]))) + +# Requêtes HTTP par route (top 5) +topk(5, sum by(route) (rate(agenthub_http_requests_total[5m]))) + +# Durée requête DB p95 par opération +histogram_quantile(0.95, sum by(le, operation) (rate(agenthub_db_query_duration_seconds_bucket[5m]))) +``` + +## 📈 Dashboard Grafana + +### Import Rapide + +1. Ouvrir Grafana → Dashboards → Import +2. Uploader `docs/grafana-dashboard.json` +3. Sélectionner votre datasource Prometheus +4. Cliquer "Import" + +### Dashboard Inclus + +Le template Grafana contient 7 panels: + +1. **Agents Connectés** (gauge) - Nombre d'agents WebSocket +2. **Rooms Actives** (gauge) - Nombre de rooms avec membres +3. **Messages envoyés** (timeseries) - Rate de messages/sec +4. **WebSocket Latency** (timeseries) - p50, p95, p99 +5. **HTTP Request Duration** (timeseries) - Latence par route +6. **HTTP Requests/sec** (timeseries) - Requêtes par route et status +7. **Database Query Duration** (timeseries) - Latence DB par opération + +Refresh automatique toutes les 10 secondes. 
+ +## 🔍 Queries d'Exemple + +### Curl brut + +```bash +# Récupérer toutes les métriques +curl http://192.168.9.23:3000/metrics + +# Filtrer une métrique spécifique +curl http://192.168.9.23:3000/metrics | grep agenthub_agents_connected +``` + +### Extraire une valeur (grep + awk) + +Les métriques Prometheus sont exposées en format texte (pas en JSON) ; pour extraire la valeur d'une métrique, utilisez `grep` et `awk`: + +```bash +curl -s http://192.168.9.23:3000/metrics \ + | grep -E '^agenthub_agents_connected ' \ + | awk '{print $2}' +``` + +## 🚀 Déploiement + +### Stack Prometheus + Grafana (Docker Compose) + +Exemple de stack complète: + +```yaml +version: '3.8' + +services: + prometheus: + image: prom/prometheus:latest + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus_data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + + grafana: + image: grafana/grafana:latest + ports: + - "3001:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + volumes: + - grafana_data:/var/lib/grafana + +volumes: + prometheus_data: + grafana_data: +``` + +### Healthcheck avec Métriques + +L'endpoint `/metrics` peut servir de healthcheck: + +```bash +# Vérifier que le serveur répond +curl -f http://192.168.9.23:3000/metrics || exit 1 +``` + +## 🔐 Sécurité + +### Considérations + +- L'endpoint `/metrics` est **public** (pas d'auth requise) pour faciliter le scraping Prometheus +- Si besoin de sécuriser, ajouter un reverse proxy (nginx) avec basic auth +- Les métriques n'exposent **pas de données sensibles** (pas de body messages, tokens, etc.) +- Seules les **métadonnées agrégées** sont exposées (counts, latencies, etc.) + +### Exemple nginx avec basic auth + +```nginx +location /metrics { + auth_basic "Prometheus Metrics"; + auth_basic_user_file /etc/nginx/.htpasswd; + proxy_pass http://agenthub:3000/metrics; +} +``` + +## 📝 Troubleshooting + +### Les métriques ne s'affichent pas + +1.
Vérifier que le serveur AgentHub est démarré: + ```bash + curl http://192.168.9.23:3000/healthz + ``` + +2. Vérifier l'endpoint metrics: + ```bash + curl http://192.168.9.23:3000/metrics + ``` + +3. Vérifier les logs Prometheus: + ```bash + docker logs prometheus + ``` + +### Les métriques sont à zéro + +- `agenthub_agents_connected`: Aucun agent WebSocket connecté +- `agenthub_rooms_active`: Aucune room avec membres dans la DB +- `agenthub_messages_total`: Aucun message envoyé depuis le dernier restart + +C'est normal si l'application vient de démarrer ou n'a pas encore d'activité. + +### Grafana n'affiche pas de données + +1. Vérifier la datasource Prometheus dans Grafana: + - Settings → Data Sources → Prometheus + - Tester la connexion + +2. Vérifier que Prometheus scrape bien AgentHub: + - Ouvrir Prometheus UI: `http://localhost:9090/targets` + - Vérifier que le job `agenthub` est UP + +3. Vérifier les queries dans Grafana: + - Explore → Sélectionner Prometheus + - Tester une query simple: `agenthub_agents_connected` + +## 🎓 Références + +- [Prometheus Documentation](https://prometheus.io/docs/) +- [Grafana Documentation](https://grafana.com/docs/) +- [prom-client (Node.js)](https://github.com/siimon/prom-client) +- [OpenMetrics Specification](https://openmetrics.io/) diff --git a/docs/grafana-dashboard.json b/docs/grafana-dashboard.json new file mode 100644 index 0000000..a194c98 --- /dev/null +++ b/docs/grafana-dashboard.json @@ -0,0 +1,668 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 100 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "editorMode": "code", + "expr": "agenthub_agents_connected", + "legendFormat": "Agents connectés", + "range": true, + "refId": "A" + } + ], + "title": "Agents Connectés", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "options": { + "orientation": "auto", + "reduceOptions": { + "values": false, + "calcs": [ + "lastNotNull" + ], + "fields": "" + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "editorMode": "code", + "expr": "agenthub_rooms_active", + "legendFormat": "Rooms actives", + "range": true, + "refId": "A" + } + ], + "title": "Rooms Actives", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": 
"line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "editorMode": "code", + "expr": "rate(agenthub_messages_total[5m])", + "legendFormat": "Messages/sec", + "range": true, + "refId": "A" + } + ], + "title": "Messages envoyés (rate)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + 
"w": 12, + "x": 0, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by(le) (rate(agenthub_websocket_latency_seconds_bucket[5m])))", + "legendFormat": "p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by(le) (rate(agenthub_websocket_latency_seconds_bucket[5m])))", + "legendFormat": "p95", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(agenthub_websocket_latency_seconds_bucket[5m])))", + "legendFormat": "p99", + "range": true, + "refId": "C" + } + ], + "title": "WebSocket Latency (message send)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, 
+ "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by(le, route) (rate(agenthub_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "{{route}} p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le, route) (rate(agenthub_http_request_duration_seconds_bucket[5m])))", + "legendFormat": "{{route}} p99", + "range": true, + "refId": "B" + } + ], + "title": "HTTP Request Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": 
"10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "editorMode": "code", + "expr": "sum by(route, status_code) (rate(agenthub_http_requests_total[5m]))", + "legendFormat": "{{route}} {{status_code}}", + "range": true, + "refId": "A" + } + ], + "title": "HTTP Requests/sec by route", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum by(le, operation) (rate(agenthub_db_query_duration_seconds_bucket[5m])))", + "legendFormat": "{{operation}} p50", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le, operation) (rate(agenthub_db_query_duration_seconds_bucket[5m])))", + "legendFormat": 
"{{operation}} p99", + "range": true, + "refId": "B" + } + ], + "title": "Database Query Duration", + "type": "timeseries" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "style": "dark", + "tags": [ + "agenthub", + "barodine" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "AgentHub Dashboard", + "uid": "agenthub-dashboard", + "version": 0, + "weekStart": "" +} diff --git a/src/app.ts b/src/app.ts index a199b8f..7f53ebd 100644 --- a/src/app.ts +++ b/src/app.ts @@ -2,12 +2,14 @@ import Fastify, { type FastifyInstance } from 'fastify'; import type { AppConfig } from './config.js'; import { pool } from './db/pool.js'; import { registerSecurityPlugins } from './lib/security.js'; +import { registerInstrumentation } from './lib/instrumentation.js'; import { registerAgentRoutes } from './routes/agents.js'; import { registerTokenRoutes } from './routes/tokens.js'; import { registerSessionRoutes } from './routes/sessions.js'; import { registerRoomRoutes } from './routes/rooms.js'; import { setupSocketIO } from './socket/index.js'; import { register as metricsRegister } from './lib/metrics.js'; +import { startMetricsCollector } from './services/metrics-collector.js'; export interface BuildAppOptions { config: AppConfig; @@ -22,6 +24,9 @@ export async function buildApp({ config }: BuildAppOptions): Promise { return { status: 'ok', uptime: process.uptime() }; }); @@ -64,5 +69,9 @@ export async function buildApp({ config }: BuildAppOptions): Promise { + app.addHook('onRequest', async (request: FastifyRequest, _reply: FastifyReply) => { + (request as any).startTime = performance.now(); + }); + + app.addHook('onResponse', async (request: FastifyRequest, reply: FastifyReply) => { + const startTime = (request as any).startTime; + if (startTime === undefined) return; + + const duration = (performance.now() - startTime) / 1000; // Convert to seconds + const method = request.method; + 
const route = request.routeOptions?.url || request.url.split('?')[0] || 'unknown'; + const statusCode = reply.statusCode.toString(); + + // Record metrics + httpRequestDurationHistogram.observe({ method, route, status_code: statusCode }, duration); + httpRequestsCounter.inc({ method, route, status_code: statusCode }); + }); +} diff --git a/src/lib/metrics.ts b/src/lib/metrics.ts index 2b73b42..58863a6 100644 --- a/src/lib/metrics.ts +++ b/src/lib/metrics.ts @@ -1,25 +1,35 @@ -import { Registry, Gauge, Counter, Histogram } from 'prom-client'; +import { Registry, Gauge, Counter, Histogram, collectDefaultMetrics } from 'prom-client'; export const register = new Registry(); -// Active WebSocket connections +// Collect default Node.js metrics (memory, CPU, event loop, etc.) +collectDefaultMetrics({ register, prefix: 'agenthub_' }); + +// Active WebSocket connections (agents connected) export const wsConnectionsGauge = new Gauge({ - name: 'agenthub_ws_connections_active', - help: 'Number of active WebSocket connections', + name: 'agenthub_agents_connected', + help: 'Number of connected agents via WebSocket', + registers: [register], +}); + +// Active rooms (rooms with at least one member) +export const roomsActiveGauge = new Gauge({ + name: 'agenthub_rooms_active', + help: 'Number of active rooms (with at least one member)', registers: [register], }); // Total messages sent export const messagesSentCounter = new Counter({ - name: 'agenthub_messages_sent_total', + name: 'agenthub_messages_total', help: 'Total number of messages sent', registers: [register], }); // Message send latency (p50, p99) export const messageSendLatencyHistogram = new Histogram({ - name: 'agenthub_message_send_latency_seconds', - help: 'Message send latency in seconds', + name: 'agenthub_websocket_latency_seconds', + help: 'WebSocket message send latency in seconds', buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5], registers: [register], }); @@ -28,7 +38,24 @@ export const 
messageSendLatencyHistogram = new Histogram({
 export const httpRequestDurationHistogram = new Histogram({
   name: 'agenthub_http_request_duration_seconds',
   help: 'HTTP request duration in seconds',
-  labelNames: ['method', 'route', 'status'],
+  labelNames: ['method', 'route', 'status_code'],
+  buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1],
+  registers: [register],
+});
+
+// HTTP requests total
+export const httpRequestsCounter = new Counter({
+  name: 'agenthub_http_requests_total',
+  help: 'Total number of HTTP requests',
+  labelNames: ['method', 'route', 'status_code'],
+  registers: [register],
+});
+
+// Database query duration
+export const dbQueryDurationHistogram = new Histogram({
+  name: 'agenthub_db_query_duration_seconds',
+  help: 'Database query duration in seconds',
+  labelNames: ['operation'],
   buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1],
   registers: [register],
 });
diff --git a/src/services/metrics-collector.ts b/src/services/metrics-collector.ts
new file mode 100644
index 0000000..861eaca
--- /dev/null
+++ b/src/services/metrics-collector.ts
@@ -0,0 +1,58 @@
+import type { Pool } from 'pg';
+import { drizzle } from 'drizzle-orm/node-postgres';
+import { roomMembers } from '../db/schema.js';
+import { sql } from 'drizzle-orm';
+import { roomsActiveGauge, dbQueryDurationHistogram } from '../lib/metrics.js';
+
+/** How often the active-rooms gauge is refreshed from the database. */
+const COLLECT_INTERVAL_MS = 30_000;
+
+/**
+ * Refresh the active-rooms gauge from the database.
+ *
+ * Counts the distinct room ids present in `roomMembers`, records how long the
+ * query took, and stores the count in `roomsActiveGauge`. Failures are logged
+ * and swallowed, so the returned promise does not reject on a query error.
+ *
+ * @param pool - pg connection pool used to run the count query
+ */
+async function collectRoomsActiveMetric(pool: Pool): Promise<void> {
+  const db = drizzle(pool);
+  const startTime = performance.now();
+
+  try {
+    // Count distinct rooms that have at least one member
+    const result = await db
+      .select({ count: sql<string>`count(distinct ${roomMembers.roomId})` })
+      .from(roomMembers);
+
+    const duration = (performance.now() - startTime) / 1000; // ms -> seconds
+    dbQueryDurationHistogram.observe({ operation: 'count_active_rooms' }, duration);
+
+    // PostgreSQL count() is a bigint, which node-postgres hands back as a
+    // string by default. Gauge#set() rejects non-number values, so coerce
+    // explicitly instead of passing the raw driver value through.
+    roomsActiveGauge.set(Number(result[0]?.count ?? 0));
+  } catch (err) {
+    console.error('Failed to collect rooms active metric:', err);
+  }
+}
+
+/**
+ * Start periodic metrics collection.
+ *
+ * Collects once immediately, then every 30 seconds. The timer is unref'd so
+ * it never keeps the process alive on its own (graceful shutdown, tests).
+ *
+ * @param pool - pg connection pool handed to each collection run
+ */
+export function startMetricsCollector(pool: Pool): void {
+  // Collect immediately on startup
+  collectRoomsActiveMetric(pool).catch(console.error);
+
+  // Then collect on a fixed interval
+  const timer = setInterval(() => {
+    collectRoomsActiveMetric(pool).catch(console.error);
+  }, COLLECT_INTERVAL_MS);
+  timer.unref();
+}