feat: add comprehensive logging infrastructure
- Add Loki/Prometheus/Grafana stack in logging-stack/
- Add log-ingest service for receiving events from AI stacks
- Add Grafana dashboard with stack_name filtering
- Update Dokploy client with setApplicationEnv method
- Configure STACK_NAME env var for deployed stacks
- Add alerting rules for stack health monitoring
This commit is contained in:
138
logging-stack/docker-compose.yml
Normal file
138
logging-stack/docker-compose.yml
Normal file
@@ -0,0 +1,138 @@
|
||||
# Compose file format version. NOTE(review): Compose v2 treats this key as
# informational only and warns that it is obsolete; it is kept here for
# compatibility with older docker-compose releases.
version: "3.8"

# AI Stack Logging Infrastructure
# Loki (logs) + Prometheus (metrics) + Grafana (visualization)
|
||||
|
||||
services:
|
||||
# =============================================================================
|
||||
# LOKI - Log Aggregation
|
||||
# =============================================================================
|
||||
# Loki log-aggregation service, reachable by the other services as
# http://loki:3100 on logging-network.
loki:
  image: grafana/loki:2.9.0
  container_name: ai-stack-loki
  restart: unless-stopped
  # Container port 3100 is published on host port 3100.
  ports:
    - "3100:3100"
  volumes:
    # Project config, mounted read-only at the path given to -config.file.
    - ./config/loki-config.yml:/etc/loki/local-config.yaml:ro
    # Named volume mounted at /loki.
    - loki-data:/loki
  command:
    - '-config.file=/etc/loki/local-config.yaml'
  networks:
    - logging-network
  # Probes the /ready endpoint from inside the container every 30s;
  # relies on wget being available in the image.
  healthcheck:
    test:
      - CMD-SHELL
      - 'wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1'
    interval: 30s
    timeout: 10s
    retries: 3
|
||||
|
||||
# =============================================================================
|
||||
# PROMTAIL - Log Collector (ships logs to Loki)
|
||||
# =============================================================================
|
||||
# Promtail log collector. Which files and containers are actually scraped is
# defined in config/promtail-config.yml, not in this file.
promtail:
  image: grafana/promtail:2.9.0
  container_name: ai-stack-promtail
  restart: unless-stopped
  # Short-form depends_on: controls start order only and does not wait for
  # Loki's healthcheck to pass.
  depends_on:
    - loki
  volumes:
    - ./config/promtail-config.yml:/etc/promtail/config.yml:ro
    # Host log locations, mounted read-only.
    - /var/log:/var/log:ro
    - /var/lib/docker/containers:/var/lib/docker/containers:ro
    # NOTE(review): ":ro" makes the socket file read-only but does not limit
    # what can be requested over the Docker API through it - confirm this
    # level of access is intended.
    - /var/run/docker.sock:/var/run/docker.sock:ro
  command:
    - '-config.file=/etc/promtail/config.yml'
  networks:
    - logging-network
|
||||
|
||||
# =============================================================================
|
||||
# PROMETHEUS - Metrics Collection
|
||||
# =============================================================================
|
||||
# Prometheus metrics server with 90 days of TSDB retention.
prometheus:
  image: prom/prometheus:v2.47.0
  container_name: ai-stack-prometheus
  restart: unless-stopped
  # Container port 9090 is published on host port 9090.
  ports:
    - "9090:9090"
  volumes:
    # Scrape configuration and alerting rule files, both read-only.
    - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
    - ./alerting:/etc/prometheus/alerting:ro
    # Named volume backing the TSDB path passed below.
    - prometheus-data:/prometheus
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    - '--storage.tsdb.path=/prometheus'
    - '--storage.tsdb.retention.time=90d'
    # NOTE(review): the lifecycle and admin HTTP APIs are enabled while port
    # 9090 is published on the host. Confirm that access to this port is
    # restricted, or drop these flags if they are not needed.
    - '--web.enable-lifecycle'
    - '--web.enable-admin-api'
  networks:
    - logging-network
  # Exec-form probe of /-/healthy every 30s; relies on wget in the image.
  healthcheck:
    test:
      - CMD
      - wget
      - '--no-verbose'
      - '--tries=1'
      - '--spider'
      - http://localhost:9090/-/healthy
    interval: 30s
    timeout: 10s
    retries: 3
|
||||
|
||||
# =============================================================================
|
||||
# GRAFANA - Visualization & Dashboards
|
||||
# =============================================================================
|
||||
# Grafana UI. Container port 3000 is published on host port 3001.
grafana:
  image: grafana/grafana:10.2.0
  container_name: ai-stack-grafana
  restart: unless-stopped
  # Short-form depends_on: start order only, no wait for health.
  depends_on:
    - loki
    - prometheus
  ports:
    - "3001:3000"
  environment:
    GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}"
    # No fallback on purpose: the UI is published on the host, so starting
    # with the well-known admin/admin credentials is unsafe. Compose aborts
    # with this message when GRAFANA_ADMIN_PASSWORD is unset or empty.
    GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set (environment or .env file)}"
    # Quoted so the value stays the string "false" rather than a YAML boolean.
    GF_USERS_ALLOW_SIGN_UP: "false"
    GF_SERVER_ROOT_URL: "${GRAFANA_ROOT_URL:-http://localhost:3001}"
    GF_INSTALL_PLUGINS: grafana-piechart-panel
  volumes:
    # Provisioning files and dashboard JSON, both read-only.
    - ./config/grafana/provisioning:/etc/grafana/provisioning:ro
    - ./dashboards:/var/lib/grafana/dashboards:ro
    # Named volume mounted at /var/lib/grafana.
    - grafana-data:/var/lib/grafana
  networks:
    - logging-network
  # Probes /api/health inside the container every 30s; relies on wget in
  # the image.
  healthcheck:
    test:
      - CMD-SHELL
      - 'wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1'
    interval: 30s
    timeout: 10s
    retries: 3
|
||||
|
||||
# =============================================================================
|
||||
# LOG INGEST API - Custom endpoint for AI stack events
|
||||
# =============================================================================
|
||||
# Custom ingest endpoint for AI stack events, built from ./log-ingest.
# Container port 3000 is published on host port 3102.
log-ingest:
  build:
    context: ./log-ingest
    dockerfile: Dockerfile
  container_name: ai-stack-log-ingest
  restart: unless-stopped
  # Short-form depends_on: start order only, no wait for Loki's healthcheck.
  depends_on:
    - loki
  ports:
    - "3102:3000"
  environment:
    # Loki is addressed by its service name on logging-network.
    LOKI_URL: 'http://loki:3100'
    LOG_LEVEL: info
  networks:
    - logging-network
  # Exec-form probe of /health every 30s.
  # NOTE(review): requires wget in the image built from ./log-ingest - verify
  # against that Dockerfile.
  healthcheck:
    test:
      - CMD
      - wget
      - '--no-verbose'
      - '--tries=1'
      - '--spider'
      - http://localhost:3000/health
    interval: 30s
    timeout: 10s
    retries: 3
|
||||
|
||||
networks:
  # Bridge network referenced by the services in this file. The explicit
  # name is used as-is instead of being prefixed with the Compose project
  # name.
  logging-network:
    driver: bridge
    name: ai-stack-logging
|
||||
|
||||
volumes:
  # Named volumes for service data. Each has a fixed name, so it is not
  # prefixed with the Compose project name.
  loki-data:
    name: ai-stack-loki-data
  prometheus-data:
    name: ai-stack-prometheus-data
  grafana-data:
    name: ai-stack-grafana-data
|
||||
Reference in New Issue
Block a user