feat: add comprehensive logging infrastructure

- Add Loki/Prometheus/Grafana stack in logging-stack/
- Add log-ingest service for receiving events from AI stacks
- Add Grafana dashboard with stack_name filtering
- Update Dokploy client with setApplicationEnv method
- Configure STACK_NAME env var for deployed stacks
- Add alerting rules for stack health monitoring
Oussama Douhou
2026-01-10 13:22:46 +01:00
parent e617114310
commit 2f4722acd0
16 changed files with 1631 additions and 0 deletions

docs/LOGGING-PLAN.md (new file)

@@ -0,0 +1,435 @@
# AI Stack Monitoring & Logging Plan
## Overview
Comprehensive logging strategy for all deployed AI stacks at `*.ai.flexinit.nl` to enable:
- Usage analytics and billing
- Debugging and support
- Security auditing
- Performance optimization
- User behavior insights
---
## 1. Log Categories
### 1.1 System Logs
| Log Type | Source | Content |
|----------|--------|---------|
| Container stdout/stderr | Docker | OpenCode server output, errors, startup |
| Health checks | Docker | Container health status over time |
| Resource metrics | cAdvisor/Prometheus | CPU, memory, network, disk I/O |
### 1.2 OpenCode Server Logs
| Log Type | Source | Content |
|----------|--------|---------|
| Server events | `--print-logs` | HTTP requests, WebSocket connections |
| Session lifecycle | OpenCode | Session start/end, duration |
| Tool invocations | OpenCode | Which tools used, success/failure |
| MCP connections | OpenCode | MCP server connects/disconnects |
### 1.3 AI Interaction Logs
| Log Type | Source | Content |
|----------|--------|---------|
| Prompts | OpenCode session | User messages (anonymized) |
| Responses | OpenCode session | AI responses (summarized) |
| Token usage | Provider API | Input/output tokens per request |
| Model selection | OpenCode | Which model used per request |
| Agent selection | oh-my-opencode | Which agent (Sisyphus, Oracle, etc.) |
### 1.4 User Activity Logs
| Log Type | Source | Content |
|----------|--------|---------|
| File operations | OpenCode tools | Read/write/edit actions |
| Bash commands | OpenCode tools | Commands executed |
| Git operations | OpenCode tools | Commits, pushes, branches |
| Web fetches | OpenCode tools | URLs accessed |
---
## 2. Architecture
```
┌─────────────────────────────────────────────────────────────────┐
│ AI Stack Container │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐ │
│ │ OpenCode │ │ Fluent Bit │ │ OpenTelemetry SDK │ │
│ │ Server │──│ (sidecar) │──│ (instrumentation) │ │
│ └──────────────┘ └──────┬───────┘ └──────────┬───────────┘ │
└────────────────────────────┼────────────────────┼───────────────┘
│ │
▼ ▼
┌─────────────────────────────────────────────────────────────────┐
│ Central Logging Stack │
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐ │
│ │ Loki │ │ Prometheus │ │ Tempo │ │
│ │ (logs) │ │ (metrics) │ │ (traces) │ │
│ └──────┬───────┘ └──────┬───────┘ └──────────┬───────────┘ │
│ └─────────────────┼─────────────────────┘ │
│ ▼ │
│ ┌──────────────┐ │
│ │ Grafana │ │
│ │ (dashboard) │ │
│ └──────────────┘ │
└─────────────────────────────────────────────────────────────────┘
```
---
## 3. Implementation Plan
### Phase 1: Container Logging (Week 1)
#### 3.1.1 Docker Log Driver
```yaml
# docker-compose addition for each stack
logging:
  driver: "fluentd"
  options:
    fluentd-address: "10.100.0.x:24224"
    tag: "ai-stack.{{.Name}}"
    fluentd-async: "true"
```
#### 3.1.2 OpenCode Server Logs
Modify Dockerfile CMD to capture structured logs:
```dockerfile
CMD ["sh", "-c", "opencode serve --hostname 0.0.0.0 --port 8080 --mdns --print-logs --log-level INFO 2>&1 | tee /var/log/opencode/server.log"]
```
#### 3.1.3 Log Rotation
```dockerfile
# Add logrotate config (apt-get update is required before install in a fresh layer)
RUN apt-get update && apt-get install -y --no-install-recommends logrotate \
 && rm -rf /var/lib/apt/lists/*
COPY logrotate.conf /etc/logrotate.d/opencode
```
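The `logrotate.conf` copied above is not shown in this plan; a minimal sketch (rotation frequency and count are illustrative) could be:

```
/var/log/opencode/*.log {
    daily
    rotate 7
    compress
    missingok
    notifempty
    copytruncate
}
```

`copytruncate` matters here because the `tee` in the CMD keeps its file descriptor open; truncating in place avoids writing to a rotated-away file.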
### Phase 2: Session & Prompt Logging (Week 2)
#### 3.2.1 OpenCode Plugin for Logging
Create logging hook in oh-my-opencode:
```typescript
// src/hooks/logging.ts
// `logEvent` is implemented in section 3.2.2; `hash` is a privacy-preserving
// content hash (e.g. SHA-256 of the text).
export const loggingHook: Hook = {
  name: 'session-logger',
  onSessionStart: async (session) => {
    await logEvent({
      type: 'session_start',
      stackName: process.env.STACK_NAME,
      sessionId: session.id,
      timestamp: new Date().toISOString()
    });
  },
  onMessage: async (message, session) => {
    await logEvent({
      type: 'message',
      stackName: process.env.STACK_NAME,
      sessionId: session.id,
      role: message.role,
      // Hash content for privacy, log length
      contentHash: hash(message.content),
      contentLength: message.content.length,
      model: session.model,
      agent: session.agent,
      timestamp: new Date().toISOString()
    });
  },
  onToolUse: async (tool, args, result, session) => {
    await logEvent({
      type: 'tool_use',
      stackName: process.env.STACK_NAME,
      sessionId: session.id,
      tool: tool.name,
      argsHash: hash(JSON.stringify(args)),
      success: !result.error,
      duration: result.duration,
      timestamp: new Date().toISOString()
    });
  }
};
```
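The `hash` helper used by the hook is not defined in this plan; a minimal sketch using Node's built-in crypto module (the 16-character truncation is an illustrative choice):

```typescript
import { createHash } from 'node:crypto';

// Privacy-preserving content fingerprint: the raw text never leaves the
// container, but identical contents hash to the same value, so repeats
// can still be correlated downstream.
function hash(content: string): string {
  return createHash('sha256').update(content, 'utf8').digest('hex').slice(0, 16);
}
```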
#### 3.2.2 Log Destination Options
**Option A: Centralized HTTP Endpoint**
```typescript
async function logEvent(event: LogEvent) {
  try {
    await fetch('https://logs.ai.flexinit.nl/ingest', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'X-Stack-Name': process.env.STACK_NAME ?? 'unknown',
        'X-API-Key': process.env.LOGGING_API_KEY ?? ''
      },
      body: JSON.stringify(event)
    });
  } catch (err) {
    // Logging must never break a user session; drop the event on failure.
    console.error('logEvent failed:', err);
  }
}
```
**Option B: Local File + Fluent Bit**
```typescript
import { promises as fs } from 'node:fs';

async function logEvent(event: LogEvent) {
  const logLine = JSON.stringify(event) + '\n';
  await fs.appendFile('/var/log/opencode/events.jsonl', logLine);
}
```
### Phase 3: Metrics Collection (Week 3)
#### 3.3.1 Prometheus Metrics Endpoint
Add to OpenCode container:
```typescript
// metrics.ts
import { register, Counter, Histogram, Gauge } from 'prom-client';

export const metrics = {
  sessionsTotal: new Counter({
    name: 'opencode_sessions_total',
    help: 'Total number of sessions',
    labelNames: ['stack_name']
  }),
  messagesTotal: new Counter({
    name: 'opencode_messages_total',
    help: 'Total messages processed',
    labelNames: ['stack_name', 'role', 'model', 'agent']
  }),
  tokensUsed: new Counter({
    name: 'opencode_tokens_total',
    help: 'Total tokens used',
    labelNames: ['stack_name', 'model', 'direction']
  }),
  toolInvocations: new Counter({
    name: 'opencode_tool_invocations_total',
    help: 'Tool invocations',
    labelNames: ['stack_name', 'tool', 'success']
  }),
  responseDuration: new Histogram({
    name: 'opencode_response_duration_seconds',
    help: 'AI response duration',
    labelNames: ['stack_name', 'model'],
    buckets: [0.5, 1, 2, 5, 10, 30, 60, 120]
  }),
  activeSessions: new Gauge({
    name: 'opencode_active_sessions',
    help: 'Currently active sessions',
    labelNames: ['stack_name']
  })
};
```
#### 3.3.2 Expose Metrics Endpoint
```typescript
// Add to container
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', register.contentType);
  res.send(await register.metrics());
});
```
### Phase 4: Central Logging Infrastructure (Week 4)
#### 3.4.1 Deploy Logging Stack
```yaml
# docker-compose.logging.yml
services:
  loki:
    image: grafana/loki:latest
    ports:
      - "3100:3100"
    volumes:
      - loki-data:/loki
  promtail:
    image: grafana/promtail:latest
    volumes:
      - /var/log:/var/log:ro
      - ./promtail-config.yml:/etc/promtail/config.yml
  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus-data:/prometheus
  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
    volumes:
      - grafana-data:/var/lib/grafana

volumes:
  loki-data:
  prometheus-data:
  grafana-data:
```
#### 3.4.2 Prometheus Scrape Config
```yaml
# prometheus.yml
# NOTE: dns_sd_configs takes literal DNS names (no wildcard patterns), e.g. one
# Swarm `tasks.<service>` record per stack; with plain Docker, docker_sd_configs
# plus a keep-relabel on the container name serves the same purpose.
scrape_configs:
  - job_name: 'ai-stacks'
    dns_sd_configs:
      - names:
          - 'tasks.ai-stack-example'  # illustrative; one literal entry per deployed stack
        type: 'A'
        port: 9090
    relabel_configs:
      - source_labels: [__meta_dns_name]
        target_label: stack_name
```
---
## 4. Data Schema
### 4.1 Event Log Schema (JSON Lines)
```json
{
  "timestamp": "2026-01-10T12:00:00.000Z",
  "stack_name": "john-dev",
  "session_id": "sess_abc123",
  "event_type": "message|tool_use|session_start|session_end|error",
  "data": {
    "role": "user|assistant",
    "model": "glm-4.7-free",
    "agent": "sisyphus",
    "tool": "bash",
    "tokens_in": 1500,
    "tokens_out": 500,
    "duration_ms": 2340,
    "success": true,
    "error_code": null
  }
}
```
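Ingest-side code can cheaply reject malformed lines before they reach Loki. A structural check against the schema above (the validator itself is illustrative, not part of the shipped service):

```typescript
// Required identifying fields from the event schema; `data` is optional.
const EVENT_TYPES = new Set(['message', 'tool_use', 'session_start', 'session_end', 'error']);

function isValidEvent(e: unknown): boolean {
  if (typeof e !== 'object' || e === null) return false;
  const ev = e as Record<string, unknown>;
  return typeof ev.timestamp === 'string'
    && typeof ev.stack_name === 'string'
    && typeof ev.session_id === 'string'
    && typeof ev.event_type === 'string'
    && EVENT_TYPES.has(ev.event_type);
}
```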
### 4.2 Metrics Labels
| Metric | Labels |
|--------|--------|
| `opencode_*` | `stack_name`, `model`, `agent`, `tool`, `success` |
---
## 5. Privacy & Security
### 5.1 Data Anonymization
- **Prompts**: Hash content, store only length and word count
- **File paths**: Anonymize to pattern (e.g., `/home/user/project/src/*.ts`)
- **Bash commands**: Log command name only, not arguments with secrets
- **Env vars**: Never log, redact from all outputs
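The path and command rules above can be sketched as pure helper functions (names and exact patterns are illustrative):

```typescript
// Reduce a file path to its directory plus an extension wildcard,
// e.g. /home/user/project/src/index.ts -> /home/user/project/src/*.ts
function anonymizePath(path: string): string {
  const parts = path.split('/');
  const file = parts.pop() ?? '';
  const ext = file.includes('.') ? '*.' + file.split('.').pop() : '*';
  return [...parts, ext].join('/');
}

// Keep only the leading command word; arguments may contain secrets.
function commandNameOnly(cmdLine: string): string {
  return cmdLine.trim().split(/\s+/)[0] ?? '';
}
```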
### 5.2 Retention Policy
| Data Type | Retention | Storage |
|-----------|-----------|---------|
| Raw logs | 7 days | Loki |
| Aggregated metrics | 90 days | Prometheus |
| Session summaries | 1 year | PostgreSQL |
| Billing data | 7 years | PostgreSQL |
### 5.3 Access Control
- Logs accessible only to platform admins
- Users can request their own data export
- Stack owners can view their stack's metrics in Grafana
---
## 6. Grafana Dashboards
### 6.1 Platform Overview
- Total active stacks
- Messages per hour (all stacks)
- Token usage by model
- Error rate
- Top agents used
### 6.2 Per-Stack Dashboard
- Session count over time
- Token usage
- Tool usage breakdown
- Response time percentiles
- Error log viewer
### 6.3 Alerts
```yaml
# alerting-rules.yml
groups:
  - name: ai-stack-alerts
    rules:
      - alert: StackUnhealthy
        expr: up{job="ai-stacks"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Stack {{ $labels.stack_name }} is down"
      - alert: HighErrorRate
        expr: rate(opencode_errors_total[5m]) > 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.stack_name }}"
```
---
## 7. Implementation Checklist
### Phase 1: Container Logging
- [ ] Set up Loki + Promtail on logging server
- [ ] Configure Docker log driver for ai-stack containers
- [ ] Add log rotation to Dockerfile
- [ ] Verify logs flowing to Loki
### Phase 2: Session Logging
- [ ] Create logging hook in oh-my-opencode
- [ ] Define event schema
- [ ] Implement log shipping (HTTP or file-based)
- [ ] Add session/message/tool logging
### Phase 3: Metrics
- [ ] Add prom-client to container
- [ ] Expose /metrics endpoint
- [ ] Configure Prometheus scraping
- [ ] Create initial Grafana dashboards
### Phase 4: Production Hardening
- [ ] Implement data anonymization
- [ ] Set up retention policies
- [ ] Configure alerts
- [ ] Document runbooks
---
## 8. Cost Estimates
| Component | Resource | Monthly Cost |
|-----------|----------|--------------|
| Loki | 50GB logs @ 7 days | ~$15 |
| Prometheus | 10GB metrics @ 90 days | ~$10 |
| Grafana | 1 instance | Free (OSS) |
| Log ingestion | Network | ~$5 |
| **Total** | | **~$30/month** |
---
## 9. Next Steps
1. **Approve plan** - Review and confirm approach
2. **Deploy logging infra** - Loki/Prometheus/Grafana on dedicated server
3. **Modify Dockerfile** - Add logging configuration
4. **Create oh-my-opencode hooks** - Session/message/tool logging
5. **Build dashboards** - Grafana visualizations
6. **Test with pilot stack** - Validate before rollout
7. **Rollout to all stacks** - Update deployer to include logging config


@@ -0,0 +1,62 @@
groups:
  - name: ai-stack-alerts
    rules:
      - alert: StackUnhealthy
        expr: up{job="ai-stacks"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Stack {{ $labels.stack_name }} is down"
          description: "AI Stack {{ $labels.stack_name }} has been unhealthy for more than 5 minutes."
      - alert: HighErrorRate
        expr: |
          sum by (stack_name) (rate(opencode_errors_total[5m]))
          /
          sum by (stack_name) (rate(opencode_messages_total[5m]))
          > 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has error rate above 10% for 10 minutes."
      - alert: NoActivity
        expr: |
          time() - opencode_last_activity_timestamp > 3600
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "No activity on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has had no activity for over 1 hour."
      - alert: HighTokenUsage
        expr: |
          sum by (stack_name) (increase(opencode_tokens_total[1h])) > 100000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High token usage on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has used over 100k tokens in the last hour."
      - alert: LogIngestDown
        expr: up{job="log-ingest"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Log ingest service is down"
          description: "The central log ingest service has been down for more than 2 minutes."
      - alert: LokiDown
        expr: up{job="loki"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Loki is down"
          description: "Loki log aggregation service has been down for more than 2 minutes."


@@ -0,0 +1,11 @@
apiVersion: 1
providers:
  - name: 'AI Stack Dashboards'
    orgId: 1
    folder: 'AI Stacks'
    type: file
    disableDeletion: false
    editable: true
    options:
      path: /var/lib/grafana/dashboards


@@ -0,0 +1,17 @@
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: false
  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    editable: false
    jsonData:
      maxLines: 1000


@@ -0,0 +1,51 @@
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

ruler:
  alertmanager_url: http://localhost:9093

limits_config:
  retention_period: 168h
  ingestion_rate_mb: 10
  ingestion_burst_size_mb: 20
  max_streams_per_user: 10000
  max_line_size: 256kb

compactor:
  working_directory: /loki/compactor
  shared_store: filesystem
  retention_enabled: true
  retention_delete_delay: 2h
  retention_delete_worker_count: 150


@@ -0,0 +1,62 @@
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: 'ai-stack-monitor'

alerting:
  alertmanagers:
    - static_configs:
        - targets: []

rule_files:
  - /etc/prometheus/alerting/*.yml

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
  - job_name: 'loki'
    static_configs:
      - targets: ['loki:3100']
  - job_name: 'log-ingest'
    static_configs:
      - targets: ['log-ingest:3000']
  - job_name: 'ai-stacks'
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 30s
    relabel_configs:
      - source_labels: [__meta_docker_container_name]
        regex: '/(ai-stack-.*|app-.*opencode.*)'
        action: keep
      - source_labels: [__meta_docker_container_name]
        regex: '/?(.*)'
        target_label: container
      - source_labels: [__meta_docker_port_private]
        regex: '9090'
        action: keep
      - source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
        target_label: service
      - source_labels: [__meta_docker_container_label_stack_name]
        target_label: stack_name
      - source_labels: [__meta_docker_container_name]
        regex: '.*opencode-([a-z0-9-]+).*'
        replacement: '${1}'
        target_label: stack_name
      - source_labels: [__meta_docker_container_name]
        regex: '.*ai-stack-([a-z0-9-]+).*'
        replacement: '${1}'
        target_label: stack_name
      - source_labels: [__meta_docker_container_network_ip]
        target_label: __address__
        replacement: '${1}:9090'
  - job_name: 'ai-stacks-static'
    file_sd_configs:
      - files:
          - /etc/prometheus/targets/*.json
        refresh_interval: 30s


@@ -0,0 +1,71 @@
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
    relabel_configs:
      - source_labels: ['__meta_docker_container_name']
        regex: '/(.*)'
        target_label: 'container'
      - source_labels: ['__meta_docker_container_label_com_docker_swarm_service_name']
        target_label: 'service'
      - source_labels: ['__meta_docker_container_label_com_docker_compose_project']
        target_label: 'project'
      - source_labels: ['__meta_docker_container_name']
        regex: '/?(ai-stack-.*|app-.*opencode.*)'
        action: keep
      - source_labels: ['__meta_docker_container_label_stack_name']
        target_label: 'stack_name'
      - source_labels: ['__meta_docker_container_name']
        regex: '.*opencode-([a-z0-9-]+).*'
        target_label: 'stack_name'
      - source_labels: ['__meta_docker_container_name']
        regex: '.*ai-stack-([a-z0-9-]+).*'
        target_label: 'stack_name'
    pipeline_stages:
      - json:
          expressions:
            output: log
            stream: stream
            timestamp: time
      - labels:
          stream:
      - timestamp:
          source: timestamp
          format: RFC3339Nano
      - output:
          source: output
  - job_name: ai-stack-events
    static_configs:
      - targets:
          - localhost
        labels:
          job: ai-stack-events
          __path__: /var/log/ai-stack/*.jsonl
    pipeline_stages:
      - json:
          expressions:
            stack_name: stack_name
            session_id: session_id
            event_type: event_type
            model: data.model
            agent: data.agent
            tool: data.tool
      - labels:
          stack_name:
          session_id:
          event_type:
          model:
          agent:
          tool:


@@ -0,0 +1,508 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
"id": 1,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.2.0",
"targets": [
{
"expr": "count(opencode_active_sessions{stack_name=~\"$stack_name\"})",
"legendFormat": "Active Sessions",
"refId": "A"
}
],
"title": "Active Sessions",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.2.0",
"targets": [
{
"expr": "sum(increase(opencode_messages_total{stack_name=~\"$stack_name\"}[$__range]))",
"legendFormat": "Total Messages",
"refId": "A"
}
],
"title": "Messages (Period)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.2.0",
"targets": [
{
"expr": "sum(increase(opencode_tokens_total{stack_name=~\"$stack_name\"}[$__range]))",
"legendFormat": "Total Tokens",
"refId": "A"
}
],
"title": "Tokens Used (Period)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.01 },
{ "color": "red", "value": 0.05 }
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.2.0",
"targets": [
{
"expr": "sum(rate(opencode_errors_total{stack_name=~\"$stack_name\"}[5m])) / sum(rate(opencode_messages_total{stack_name=~\"$stack_name\"}[5m]))",
"legendFormat": "Error Rate",
"refId": "A"
}
],
"title": "Error Rate",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"id": 5,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"targets": [
{
"expr": "sum by (stack_name) (rate(opencode_messages_total{stack_name=~\"$stack_name\"}[5m]))",
"legendFormat": "{{stack_name}}",
"refId": "A"
}
],
"title": "Messages per Second by Stack",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"id": 6,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"targets": [
{
"expr": "sum by (model) (rate(opencode_tokens_total{stack_name=~\"$stack_name\"}[5m]))",
"legendFormat": "{{model}}",
"refId": "A"
}
],
"title": "Token Usage by Model",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
}
},
"overrides": []
},
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 },
"id": 7,
"options": {
"legend": { "displayMode": "list", "placement": "right", "showLegend": true },
"pieType": "pie",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"tooltip": { "mode": "single", "sort": "none" }
},
"targets": [
{
"expr": "sum by (tool) (increase(opencode_tool_invocations_total{stack_name=~\"$stack_name\"}[$__range]))",
"legendFormat": "{{tool}}",
"refId": "A"
}
],
"title": "Tool Usage Distribution",
"type": "piechart"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
}
},
"overrides": []
},
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 },
"id": 8,
"options": {
"legend": { "displayMode": "list", "placement": "right", "showLegend": true },
"pieType": "pie",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"tooltip": { "mode": "single", "sort": "none" }
},
"targets": [
{
"expr": "sum by (agent) (increase(opencode_messages_total{stack_name=~\"$stack_name\"}[$__range]))",
"legendFormat": "{{agent}}",
"refId": "A"
}
],
"title": "Agent Usage Distribution",
"type": "piechart"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"custom": {
"align": "auto",
"cellOptions": { "type": "auto" },
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
}
},
"overrides": []
},
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 },
"id": 9,
"options": {
"cellHeight": "sm",
"footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false },
"showHeader": true
},
"pluginVersion": "10.2.0",
"targets": [
{
"expr": "topk(10, sum by (stack_name) (increase(opencode_messages_total[$__range])))",
"format": "table",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "Top 10 Active Stacks",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": { "Time": true },
"indexByName": {},
"renameByName": { "Value": "Messages", "stack_name": "Stack Name" }
}
}
],
"type": "table"
},
{
"datasource": {
"type": "loki",
"uid": "loki"
},
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 20 },
"id": 10,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"targets": [
{
"expr": "{stack_name=~\"$stack_name\"} |= ``",
"legendFormat": "",
"refId": "A"
}
],
"title": "Live Logs",
"type": "logs"
}
],
"refresh": "10s",
"schemaVersion": 38,
"tags": ["ai-stack", "monitoring"],
"templating": {
"list": [
{
"allValue": ".*",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"definition": "label_values(opencode_messages_total, stack_name)",
"hide": 0,
"includeAll": true,
"label": "Stack Name",
"multi": true,
"name": "stack_name",
"options": [],
"query": {
"query": "label_values(opencode_messages_total, stack_name)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "AI Stack Overview",
"uid": "ai-stack-overview",
"version": 1,
"weekStart": ""
}


@@ -0,0 +1,138 @@
version: "3.8"

# AI Stack Logging Infrastructure
# Loki (logs) + Prometheus (metrics) + Grafana (visualization)

services:
  # ===========================================================================
  # LOKI - Log Aggregation
  # ===========================================================================
  loki:
    image: grafana/loki:2.9.0
    container_name: ai-stack-loki
    ports:
      - "3100:3100"
    volumes:
      - ./config/loki-config.yml:/etc/loki/local-config.yaml:ro
      - loki-data:/loki
    command: -config.file=/etc/loki/local-config.yaml
    networks:
      - logging-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ===========================================================================
  # PROMTAIL - Log Collector (ships logs to Loki)
  # ===========================================================================
  promtail:
    image: grafana/promtail:2.9.0
    container_name: ai-stack-promtail
    volumes:
      - ./config/promtail-config.yml:/etc/promtail/config.yml:ro
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command: -config.file=/etc/promtail/config.yml
    networks:
      - logging-network
    depends_on:
      - loki
    restart: unless-stopped

  # ===========================================================================
  # PROMETHEUS - Metrics Collection
  # ===========================================================================
  prometheus:
    image: prom/prometheus:v2.47.0
    container_name: ai-stack-prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./alerting:/etc/prometheus/alerting:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=90d'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    networks:
      - logging-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ===========================================================================
  # GRAFANA - Visualization & Dashboards
  # ===========================================================================
  grafana:
    image: grafana/grafana:10.2.0
    container_name: ai-stack-grafana
    ports:
      - "3001:3000"
    environment:
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3001}
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
    volumes:
      - ./config/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./dashboards:/var/lib/grafana/dashboards:ro
      - grafana-data:/var/lib/grafana
    networks:
      - logging-network
    depends_on:
      - loki
      - prometheus
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ===========================================================================
  # LOG INGEST API - Custom endpoint for AI stack events
  # ===========================================================================
  log-ingest:
    build:
      context: ./log-ingest
      dockerfile: Dockerfile
    container_name: ai-stack-log-ingest
    ports:
      - "3102:3000"
    environment:
      - LOKI_URL=http://loki:3100
      - LOG_LEVEL=info
    networks:
      - logging-network
    depends_on:
      - loki
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3

networks:
  logging-network:
    driver: bridge
    name: ai-stack-logging

volumes:
  loki-data:
    name: ai-stack-loki-data
  prometheus-data:
    name: ai-stack-prometheus-data
  grafana-data:
    name: ai-stack-grafana-data


@@ -0,0 +1,13 @@
FROM oven/bun:1.0
WORKDIR /app
COPY package.json bun.lockb* ./
RUN bun install --frozen-lockfile 2>/dev/null || bun install
COPY . .
ENV PORT=3000
EXPOSE 3000
CMD ["bun", "run", "src/index.ts"]


@@ -0,0 +1,16 @@
{
  "name": "ai-stack-log-ingest",
  "version": "1.0.0",
  "type": "module",
  "scripts": {
    "start": "bun run src/index.ts",
    "dev": "bun --watch run src/index.ts"
  },
  "dependencies": {
    "hono": "^4.0.0",
    "prom-client": "^15.0.0"
  },
  "devDependencies": {
    "@types/bun": "latest"
  }
}


@@ -0,0 +1,199 @@
import { Hono } from 'hono';
import { cors } from 'hono/cors';
import { logger } from 'hono/logger';
import { Registry, Counter, Histogram, Gauge, collectDefaultMetrics } from 'prom-client';

const app = new Hono();
const register = new Registry();
collectDefaultMetrics({ register });

const metrics = {
  eventsReceived: new Counter({
    name: 'log_ingest_events_total',
    help: 'Total events received',
    labelNames: ['stack_name', 'event_type'],
    registers: [register]
  }),
  eventProcessingDuration: new Histogram({
    name: 'log_ingest_processing_duration_seconds',
    help: 'Event processing duration',
    labelNames: ['stack_name'],
    buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
    registers: [register]
  }),
  lokiPushErrors: new Counter({
    name: 'log_ingest_loki_errors_total',
    help: 'Loki push errors',
    registers: [register]
  }),
  activeStacks: new Gauge({
    name: 'log_ingest_active_stacks',
    help: 'Number of active stacks sending events',
    registers: [register]
  })
};

const LOKI_URL = process.env.LOKI_URL || 'http://loki:3100';

interface LogEvent {
  timestamp?: string;
  stack_name: string;
  session_id?: string;
  event_type: 'session_start' | 'session_end' | 'message' | 'tool_use' | 'error' | 'mcp_connect' | 'mcp_disconnect';
  data?: {
    role?: 'user' | 'assistant' | 'system';
    model?: string;
    agent?: string;
    tool?: string;
    tokens_in?: number;
    tokens_out?: number;
    duration_ms?: number;
    success?: boolean;
    error_code?: string;
    error_message?: string;
    content_length?: number;
    content_hash?: string;
    mcp_server?: string;
  };
}

// In-memory only; the active-stacks gauge resets when the container restarts.
const activeStacksSet = new Set<string>();

// Group events into Loki streams by label set, then push them in one request.
// NOTE: session_id/model/agent/tool become stream labels, so Loki stream
// cardinality grows with usage.
async function pushToLoki(events: LogEvent[]): Promise<void> {
  const streams: Record<string, { stream: Record<string, string>; values: [string, string][] }> = {};
  for (const event of events) {
    const labels = {
      job: 'ai-stack-events',
      stack_name: event.stack_name,
      event_type: event.event_type,
      ...(event.session_id && { session_id: event.session_id }),
      ...(event.data?.model && { model: event.data.model }),
      ...(event.data?.agent && { agent: event.data.agent }),
      ...(event.data?.tool && { tool: event.data.tool })
    };
    const labelKey = JSON.stringify(labels);
    if (!streams[labelKey]) {
      streams[labelKey] = { stream: labels, values: [] };
    }
    // Loki expects timestamps as unix nanoseconds encoded as strings.
    const timestamp = event.timestamp || new Date().toISOString();
    const nanoseconds = BigInt(new Date(timestamp).getTime()) * BigInt(1_000_000);
    streams[labelKey].values.push([nanoseconds.toString(), JSON.stringify(event)]);
  }
  const payload = { streams: Object.values(streams) };
  const response = await fetch(`${LOKI_URL}/loki/api/v1/push`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload)
  });
  if (!response.ok) {
    const text = await response.text();
    throw new Error(`Loki push failed: ${response.status} ${text}`);
  }
}

app.use('*', cors());
app.use('*', logger());

app.get('/health', (c) => {
  return c.json({ status: 'healthy', timestamp: new Date().toISOString() });
});

app.get('/metrics', async (c) => {
  metrics.activeStacks.set(activeStacksSet.size);
  c.header('Content-Type', register.contentType);
  return c.text(await register.metrics());
});

// Accepts a single event or an array; rejects the whole request if any
// event is missing its required fields.
app.post('/ingest', async (c) => {
  const startTime = Date.now();
  try {
    const body = await c.req.json();
    const events: LogEvent[] = Array.isArray(body) ? body : [body];
    for (const event of events) {
      if (!event.stack_name || !event.event_type) {
        return c.json({ error: 'Missing required fields: stack_name, event_type' }, 400);
      }
      activeStacksSet.add(event.stack_name);
      metrics.eventsReceived.inc({ stack_name: event.stack_name, event_type: event.event_type });
    }
    await pushToLoki(events);
    const duration = (Date.now() - startTime) / 1000;
    for (const event of events) {
      metrics.eventProcessingDuration.observe({ stack_name: event.stack_name }, duration);
    }
    return c.json({ success: true, count: events.length });
  } catch (error) {
    metrics.lokiPushErrors.inc();
    console.error('Ingest error:', error);
    return c.json({ error: 'Failed to process events', details: String(error) }, 500);
  }
});

// Batch variant: skips malformed events instead of rejecting the request,
// and only forwards the valid ones to Loki.
app.post('/ingest/batch', async (c) => {
  const startTime = Date.now();
  try {
    const body = await c.req.json();
    if (!Array.isArray(body)) {
      return c.json({ error: 'Expected array of events' }, 400);
    }
    const valid: LogEvent[] = [];
    for (const event of body as LogEvent[]) {
      if (!event.stack_name || !event.event_type) {
        continue;
      }
      valid.push(event);
      activeStacksSet.add(event.stack_name);
      metrics.eventsReceived.inc({ stack_name: event.stack_name, event_type: event.event_type });
    }
    await pushToLoki(valid);
    const duration = (Date.now() - startTime) / 1000;
    metrics.eventProcessingDuration.observe({ stack_name: 'batch' }, duration);
    return c.json({ success: true, count: valid.length });
  } catch (error) {
    metrics.lokiPushErrors.inc();
    console.error('Batch ingest error:', error);
    return c.json({ error: 'Failed to process batch', details: String(error) }, 500);
  }
});

const port = parseInt(process.env.PORT || '3000', 10);
console.log(`Log ingest service starting on port ${port}`);

// Bun serves this default export as an HTTP server.
export default {
  port,
  fetch: app.fetch
};
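As a quick sanity check of the wire format handled above: Loki's push API expects each value as a `["<unix nanoseconds as string>", "<log line>"]` pair. The conversion can be exercised in isolation with a standalone sketch (`toLokiValue` is a hypothetical helper mirroring the service code, not part of it):

```typescript
// Hypothetical helper mirroring the timestamp handling in pushToLoki above.
// Loki's /loki/api/v1/push expects values as [nanosecond-string, line].
function toLokiValue(event: { timestamp?: string; stack_name: string }): [string, string] {
  const timestamp = event.timestamp ?? new Date().toISOString();
  // Millisecond epoch -> nanoseconds, kept as BigInt to avoid float precision loss.
  const nanoseconds = BigInt(new Date(timestamp).getTime()) * BigInt(1_000_000);
  return [nanoseconds.toString(), JSON.stringify(event)];
}

const [ns, line] = toLokiValue({
  timestamp: '2026-01-10T12:00:00.000Z',
  stack_name: 'demo',
});
console.log(ns);   // "1768046400000000000" (ms epoch * 1e6, as a string)
console.log(line); // {"timestamp":"2026-01-10T12:00:00.000Z","stack_name":"demo"}
```

BigInt matters here: a plain number would lose precision above 2^53, which nanosecond epochs exceed.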


@@ -0,0 +1,12 @@
{
  "compilerOptions": {
    "target": "ESNext",
    "module": "ESNext",
    "moduleResolution": "bundler",
    "types": ["bun-types"],
    "esModuleInterop": true,
    "strict": true,
    "skipLibCheck": true,
    "noEmit": true
  }
}


@@ -388,6 +388,16 @@ export class DokployProductionClient {
    );
  }

  async setApplicationEnv(applicationId: string, env: string): Promise<void> {
    await this.request(
      'POST',
      '/application.update',
      { applicationId, env },
      'application',
      'set-env'
    );
  }

  async getApplication(applicationId: string): Promise<DokployApplication> {
    return this.request<DokployApplication>(
      'GET',


@@ -149,6 +149,20 @@ export class DokployClient {
    } satisfies CreateDomainRequest);
  }

  async setApplicationEnv(applicationId: string, env: string): Promise<void> {
    await this.request('POST', '/application.update', {
      applicationId,
      env
    });
  }

  // NOTE: this sets dockerLabels to a single `key=value` string, which
  // likely overwrites any previously set labels on the application.
  async addApplicationLabel(applicationId: string, key: string, value: string): Promise<void> {
    await this.request('POST', '/application.update', {
      applicationId,
      dockerLabels: `${key}=${value}`
    });
  }

  async deployApplication(applicationId: string): Promise<void> {
    await this.request('POST', '/application.deploy', { applicationId });
  }


@@ -280,6 +280,18 @@ export class ProductionDeployer {
      registryId: config.registryId,
    });

    state.progress = 52;
    state.message = 'Setting environment variables for logging';
    const envVars = [
      `STACK_NAME=${config.stackName}`,
      `USAGE_LOGGING_ENABLED=true`,
      `LOG_INGEST_URL=${process.env.LOG_INGEST_URL || 'http://10.100.0.20:3102/ingest'}`,
      `METRICS_PORT=9090`,
    ].join('\n');
    await this.client.setApplicationEnv(state.resources.applicationId, envVars);

    state.progress = 55;
    state.message = 'Creating persistent storage';
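Dokploy's env field in the deployer above is a single newline-joined string rather than a key/value map. A small sketch of what `setApplicationEnv` receives for a hypothetical stack (`buildLoggingEnv` and the stack name are illustrative, not part of the deployer):

```typescript
// Illustrative re-statement of the env assembly in the deployer above:
// four KEY=value lines joined with '\n' into one string.
function buildLoggingEnv(stackName: string, ingestUrl: string): string {
  return [
    `STACK_NAME=${stackName}`,
    `USAGE_LOGGING_ENABLED=true`,
    `LOG_INGEST_URL=${ingestUrl}`,
    `METRICS_PORT=9090`,
  ].join('\n');
}

const env = buildLoggingEnv('acme', 'http://10.100.0.20:3102/ingest');
console.log(env);
// STACK_NAME=acme
// USAGE_LOGGING_ENABLED=true
// LOG_INGEST_URL=http://10.100.0.20:3102/ingest
// METRICS_PORT=9090
```

Because the whole block is one string, each call replaces the application's env wholesale; callers that need to preserve existing variables would have to read and merge them first.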