version: "3.8"

# AI Stack Logging Infrastructure
# Loki (logs) + Prometheus (metrics) + Grafana (visualization)
#
# All services share the `logging-network` bridge network and reach each
# other by service name (for example, log-ingest uses http://loki:3100).

services:
  # ===========================================================================
  # LOKI - Log Aggregation
  # ===========================================================================
  loki:
    image: grafana/loki:2.9.0
    container_name: ai-stack-loki
    ports:
      - "3100:3100"
    volumes:
      # Host config is mounted read-only at the path passed to -config.file.
      - ./config/loki-config.yml:/etc/loki/local-config.yaml:ro
      - loki-data:/loki
    command: -config.file=/etc/loki/local-config.yaml
    networks:
      - logging-network
    restart: unless-stopped
    healthcheck:
      test:
        - "CMD-SHELL"
        - "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"
      interval: 30s
      timeout: 10s
      retries: 3

  # ===========================================================================
  # PROMTAIL - Log Collector (ships logs to Loki)
  # ===========================================================================
  promtail:
    image: grafana/promtail:2.9.0
    container_name: ai-stack-promtail
    volumes:
      - ./config/promtail-config.yml:/etc/promtail/config.yml:ro
      # Host log sources, all mounted read-only.
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      # NOTE(review): the Docker socket is mounted (read-only) - presumably
      # for Docker service discovery in promtail-config.yml; confirm it is
      # needed, since socket access is sensitive even when mounted :ro.
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command: -config.file=/etc/promtail/config.yml
    networks:
      - logging-network
    # Short-form depends_on: controls start order only; it does not wait for
    # Loki's healthcheck to pass.
    depends_on:
      - loki
    restart: unless-stopped

  # ===========================================================================
  # PROMETHEUS - Metrics Collection
  # ===========================================================================
  prometheus:
    image: prom/prometheus:v2.47.0
    container_name: ai-stack-prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./alerting:/etc/prometheus/alerting:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # Keep 90 days of metrics in the prometheus-data volume.
      - '--storage.tsdb.retention.time=90d'
      # NOTE(review): the lifecycle and admin APIs are enabled while port
      # 9090 is published on the host. Neither API is authenticated by
      # Prometheus itself; confirm the host port is not reachable from
      # untrusted networks, or drop these flags if they are not used.
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    networks:
      - logging-network
    restart: unless-stopped
    healthcheck:
      test:
        - "CMD"
        - "wget"
        - "--no-verbose"
        - "--tries=1"
        - "--spider"
        - "http://localhost:9090/-/healthy"
      interval: 30s
      timeout: 10s
      retries: 3

  # ===========================================================================
  # GRAFANA - Visualization & Dashboards
  # ===========================================================================
  grafana:
    image: grafana/grafana:10.2.0
    container_name: ai-stack-grafana
    ports:
      # Host port 3001 maps to Grafana's container port 3000.
      - "3001:3000"
    environment:
      # NOTE(review): when GRAFANA_ADMIN_USER / GRAFANA_ADMIN_PASSWORD are
      # not set, both fall back to "admin". Set them (for example in .env)
      # for any deployment that is reachable by others.
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3001}
      # NOTE(review): recent Grafana versions ship a built-in pie chart
      # panel; confirm the dashboards still require this plugin.
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
    volumes:
      - ./config/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./dashboards:/var/lib/grafana/dashboards:ro
      - grafana-data:/var/lib/grafana
    networks:
      - logging-network
    # Start order only; does not wait for Loki/Prometheus to be healthy.
    depends_on:
      - loki
      - prometheus
    restart: unless-stopped
    healthcheck:
      test:
        - "CMD-SHELL"
        - "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"
      interval: 30s
      timeout: 10s
      retries: 3

  # ===========================================================================
  # LOG INGEST API - Custom endpoint for AI stack events
  # ===========================================================================
  log-ingest:
    build:
      context: ./log-ingest
      dockerfile: Dockerfile
    container_name: ai-stack-log-ingest
    ports:
      # Host port 3102 maps to the service's container port 3000.
      - "3102:3000"
    environment:
      # Loki is addressed by its service name on logging-network.
      - LOKI_URL=http://loki:3100
      - LOG_LEVEL=info
    networks:
      - logging-network
    # Start order only; does not wait for Loki to be healthy.
    depends_on:
      - loki
    restart: unless-stopped
    healthcheck:
      test:
        - "CMD"
        - "wget"
        - "--no-verbose"
        - "--tries=1"
        - "--spider"
        - "http://localhost:3000/health"
      interval: 30s
      timeout: 10s
      retries: 3

# Single bridge network shared by every service; the explicit name keeps it
# from being prefixed with the Compose project name.
networks:
  logging-network:
    driver: bridge
    name: ai-stack-logging

# Named volumes with explicit names so the data volumes keep the same name
# regardless of the Compose project name.
volumes:
  loki-data:
    name: ai-stack-loki-data
  prometheus-data:
    name: ai-stack-prometheus-data
  grafana-data:
    name: ai-stack-grafana-data