diff --git a/docs/LOGGING-PLAN.md b/docs/LOGGING-PLAN.md new file mode 100644 index 0000000..038df22 --- /dev/null +++ b/docs/LOGGING-PLAN.md @@ -0,0 +1,435 @@ +# AI Stack Monitoring & Logging Plan + +## Overview + +Comprehensive logging strategy for all deployed AI stacks at `*.ai.flexinit.nl` to enable: +- Usage analytics and billing +- Debugging and support +- Security auditing +- Performance optimization +- User behavior insights + +--- + +## 1. Log Categories + +### 1.1 System Logs +| Log Type | Source | Content | +|----------|--------|---------| +| Container stdout/stderr | Docker | OpenCode server output, errors, startup | +| Health checks | Docker | Container health status over time | +| Resource metrics | cAdvisor/Prometheus | CPU, memory, network, disk I/O | + +### 1.2 OpenCode Server Logs +| Log Type | Source | Content | +|----------|--------|---------| +| Server events | `--print-logs` | HTTP requests, WebSocket connections | +| Session lifecycle | OpenCode | Session start/end, duration | +| Tool invocations | OpenCode | Which tools used, success/failure | +| MCP connections | OpenCode | MCP server connects/disconnects | + +### 1.3 AI Interaction Logs +| Log Type | Source | Content | +|----------|--------|---------| +| Prompts | OpenCode session | User messages (anonymized) | +| Responses | OpenCode session | AI responses (summarized) | +| Token usage | Provider API | Input/output tokens per request | +| Model selection | OpenCode | Which model used per request | +| Agent selection | oh-my-opencode | Which agent (Sisyphus, Oracle, etc.) | + +### 1.4 User Activity Logs +| Log Type | Source | Content | +|----------|--------|---------| +| File operations | OpenCode tools | Read/write/edit actions | +| Bash commands | OpenCode tools | Commands executed | +| Git operations | OpenCode tools | Commits, pushes, branches | +| Web fetches | OpenCode tools | URLs accessed | + +--- + +## 2. 
Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ AI Stack Container │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐ │ +│ │ OpenCode │ │ Fluent Bit │ │ OpenTelemetry SDK │ │ +│ │ Server │──│ (sidecar) │──│ (instrumentation) │ │ +│ └──────────────┘ └──────┬───────┘ └──────────┬───────────┘ │ +└────────────────────────────┼────────────────────┼───────────────┘ + │ │ + ▼ ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Central Logging Stack │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐ │ +│ │ Loki │ │ Prometheus │ │ Tempo │ │ +│ │ (logs) │ │ (metrics) │ │ (traces) │ │ +│ └──────┬───────┘ └──────┬───────┘ └──────────┬───────────┘ │ +│ └─────────────────┼─────────────────────┘ │ +│ ▼ │ +│ ┌──────────────┐ │ +│ │ Grafana │ │ +│ │ (dashboard) │ │ +│ └──────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +--- + +## 3. Implementation Plan + +### Phase 1: Container Logging (Week 1) + +#### 3.1.1 Docker Log Driver +```yaml +# docker-compose addition for each stack +logging: + driver: "fluentd" + options: + fluentd-address: "10.100.0.x:24224" + tag: "ai-stack.{{.Name}}" + fluentd-async: "true" +``` + +#### 3.1.2 OpenCode Server Logs +Modify Dockerfile CMD to capture structured logs: +```dockerfile +CMD ["sh", "-c", "opencode serve --hostname 0.0.0.0 --port 8080 --mdns --print-logs --log-level INFO 2>&1 | tee /var/log/opencode/server.log"] +``` + +#### 3.1.3 Log Rotation +```dockerfile +# Add logrotate config +RUN apt-get install -y logrotate +COPY logrotate.conf /etc/logrotate.d/opencode +``` + +### Phase 2: Session & Prompt Logging (Week 2) + +#### 3.2.1 OpenCode Plugin for Logging +Create logging hook in oh-my-opencode: + +```typescript +// src/hooks/logging.ts +export const loggingHook: Hook = { + name: 'session-logger', + + onSessionStart: async (session) => { + await logEvent({ + type: 'session_start', + stackName: 
process.env.STACK_NAME, + sessionId: session.id, + timestamp: new Date().toISOString() + }); + }, + + onMessage: async (message, session) => { + await logEvent({ + type: 'message', + stackName: process.env.STACK_NAME, + sessionId: session.id, + role: message.role, + // Hash content for privacy, log length + contentHash: hash(message.content), + contentLength: message.content.length, + model: session.model, + agent: session.agent, + timestamp: new Date().toISOString() + }); + }, + + onToolUse: async (tool, args, result, session) => { + await logEvent({ + type: 'tool_use', + stackName: process.env.STACK_NAME, + sessionId: session.id, + tool: tool.name, + argsHash: hash(JSON.stringify(args)), + success: !result.error, + duration: result.duration, + timestamp: new Date().toISOString() + }); + } +}; +``` + +#### 3.2.2 Log Destination Options + +**Option A: Centralized HTTP Endpoint** +```typescript +async function logEvent(event: LogEvent) { + await fetch('https://logs.ai.flexinit.nl/ingest', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'X-Stack-Name': process.env.STACK_NAME, + 'X-API-Key': process.env.LOGGING_API_KEY + }, + body: JSON.stringify(event) + }); +} +``` + +**Option B: Local File + Fluent Bit** +```typescript +async function logEvent(event: LogEvent) { + const logLine = JSON.stringify(event) + '\n'; + await fs.appendFile('/var/log/opencode/events.jsonl', logLine); +} +``` + +### Phase 3: Metrics Collection (Week 3) + +#### 3.3.1 Prometheus Metrics Endpoint +Add to OpenCode container: + +```typescript +// metrics.ts +import { register, Counter, Histogram, Gauge } from 'prom-client'; + +export const metrics = { + sessionsTotal: new Counter({ + name: 'opencode_sessions_total', + help: 'Total number of sessions', + labelNames: ['stack_name'] + }), + + messagesTotal: new Counter({ + name: 'opencode_messages_total', + help: 'Total messages processed', + labelNames: ['stack_name', 'role', 'model', 'agent'] + }), + + tokensUsed: new 
Counter({ + name: 'opencode_tokens_total', + help: 'Total tokens used', + labelNames: ['stack_name', 'model', 'direction'] + }), + + toolInvocations: new Counter({ + name: 'opencode_tool_invocations_total', + help: 'Tool invocations', + labelNames: ['stack_name', 'tool', 'success'] + }), + + responseDuration: new Histogram({ + name: 'opencode_response_duration_seconds', + help: 'AI response duration', + labelNames: ['stack_name', 'model'], + buckets: [0.5, 1, 2, 5, 10, 30, 60, 120] + }), + + activeSessions: new Gauge({ + name: 'opencode_active_sessions', + help: 'Currently active sessions', + labelNames: ['stack_name'] + }) +}; +``` + +#### 3.3.2 Expose Metrics Endpoint +```typescript +// Add to container +app.get('/metrics', async (req, res) => { + res.set('Content-Type', register.contentType); + res.send(await register.metrics()); +}); +``` + +### Phase 4: Central Logging Infrastructure (Week 4) + +#### 3.4.1 Deploy Logging Stack +```yaml +# docker-compose.logging.yml +services: + loki: + image: grafana/loki:latest + ports: + - "3100:3100" + volumes: + - loki-data:/loki + + promtail: + image: grafana/promtail:latest + volumes: + - /var/log:/var/log:ro + - ./promtail-config.yml:/etc/promtail/config.yml + + prometheus: + image: prom/prometheus:latest + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml + - prometheus-data:/prometheus + + grafana: + image: grafana/grafana:latest + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD} + volumes: + - grafana-data:/var/lib/grafana +``` + +#### 3.4.2 Prometheus Scrape Config +```yaml +# prometheus.yml +scrape_configs: + - job_name: 'ai-stacks' + dns_sd_configs: + - names: + - 'tasks.ai-stack-*' + type: 'A' + port: 9090 + relabel_configs: + - source_labels: [__meta_dns_name] + target_label: stack_name +``` + +--- + +## 4. 
Data Schema + +### 4.1 Event Log Schema (JSON Lines) +```json +{ + "timestamp": "2026-01-10T12:00:00.000Z", + "stack_name": "john-dev", + "session_id": "sess_abc123", + "event_type": "message|tool_use|session_start|session_end|error", + "data": { + "role": "user|assistant", + "model": "glm-4.7-free", + "agent": "sisyphus", + "tool": "bash", + "tokens_in": 1500, + "tokens_out": 500, + "duration_ms": 2340, + "success": true, + "error_code": null + } +} +``` + +### 4.2 Metrics Labels +| Metric | Labels | +|--------|--------| +| `opencode_*` | `stack_name`, `model`, `agent`, `tool`, `success` | + +--- + +## 5. Privacy & Security + +### 5.1 Data Anonymization +- **Prompts**: Hash content, store only length and word count +- **File paths**: Anonymize to pattern (e.g., `/home/user/project/src/*.ts`) +- **Bash commands**: Log command name only, not arguments with secrets +- **Env vars**: Never log, redact from all outputs + +### 5.2 Retention Policy +| Data Type | Retention | Storage | +|-----------|-----------|---------| +| Raw logs | 7 days | Loki | +| Aggregated metrics | 90 days | Prometheus | +| Session summaries | 1 year | PostgreSQL | +| Billing data | 7 years | PostgreSQL | + +### 5.3 Access Control +- Logs accessible only to platform admins +- Users can request their own data export +- Stack owners can view their stack's metrics in Grafana + +--- + +## 6. 
Grafana Dashboards + +### 6.1 Platform Overview +- Total active stacks +- Messages per hour (all stacks) +- Token usage by model +- Error rate +- Top agents used + +### 6.2 Per-Stack Dashboard +- Session count over time +- Token usage +- Tool usage breakdown +- Response time percentiles +- Error log viewer + +### 6.3 Alerts +```yaml +# alerting-rules.yml +groups: + - name: ai-stack-alerts + rules: + - alert: StackUnhealthy + expr: up{job="ai-stacks"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Stack {{ $labels.stack_name }} is down" + + - alert: HighErrorRate + expr: rate(opencode_errors_total[5m]) > 0.1 + for: 10m + labels: + severity: warning + annotations: + summary: "High error rate on {{ $labels.stack_name }}" +``` + +--- + +## 7. Implementation Checklist + +### Phase 1: Container Logging +- [ ] Set up Loki + Promtail on logging server +- [ ] Configure Docker log driver for ai-stack containers +- [ ] Add log rotation to Dockerfile +- [ ] Verify logs flowing to Loki + +### Phase 2: Session Logging +- [ ] Create logging hook in oh-my-opencode +- [ ] Define event schema +- [ ] Implement log shipping (HTTP or file-based) +- [ ] Add session/message/tool logging + +### Phase 3: Metrics +- [ ] Add prom-client to container +- [ ] Expose /metrics endpoint +- [ ] Configure Prometheus scraping +- [ ] Create initial Grafana dashboards + +### Phase 4: Production Hardening +- [ ] Implement data anonymization +- [ ] Set up retention policies +- [ ] Configure alerts +- [ ] Document runbooks + +--- + +## 8. Cost Estimates + +| Component | Resource | Monthly Cost | +|-----------|----------|--------------| +| Loki | 50GB logs @ 7 days | ~$15 | +| Prometheus | 10GB metrics @ 90 days | ~$10 | +| Grafana | 1 instance | Free (OSS) | +| Log ingestion | Network | ~$5 | +| **Total** | | **~$30/month** | + +--- + +## 9. Next Steps + +1. **Approve plan** - Review and confirm approach +2. 
**Deploy logging infra** - Loki/Prometheus/Grafana on dedicated server +3. **Modify Dockerfile** - Add logging configuration +4. **Create oh-my-opencode hooks** - Session/message/tool logging +5. **Build dashboards** - Grafana visualizations +6. **Test with pilot stack** - Validate before rollout +7. **Rollout to all stacks** - Update deployer to include logging config diff --git a/logging-stack/alerting/ai-stack-alerts.yml b/logging-stack/alerting/ai-stack-alerts.yml new file mode 100644 index 0000000..097f7b6 --- /dev/null +++ b/logging-stack/alerting/ai-stack-alerts.yml @@ -0,0 +1,62 @@ +groups: + - name: ai-stack-alerts + rules: + - alert: StackUnhealthy + expr: up{job="ai-stacks"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Stack {{ $labels.stack_name }} is down" + description: "AI Stack {{ $labels.stack_name }} has been unhealthy for more than 5 minutes." + + - alert: HighErrorRate + expr: | + sum by (stack_name) (rate(opencode_errors_total[5m])) + / + sum by (stack_name) (rate(opencode_messages_total[5m])) + > 0.1 + for: 10m + labels: + severity: warning + annotations: + summary: "High error rate on {{ $labels.stack_name }}" + description: "Stack {{ $labels.stack_name }} has error rate above 10% for 10 minutes." + + - alert: NoActivity + expr: | + time() - opencode_last_activity_timestamp > 3600 + for: 5m + labels: + severity: info + annotations: + summary: "No activity on {{ $labels.stack_name }}" + description: "Stack {{ $labels.stack_name }} has had no activity for over 1 hour." + + - alert: HighTokenUsage + expr: | + sum by (stack_name) (increase(opencode_tokens_total[1h])) > 100000 + for: 5m + labels: + severity: warning + annotations: + summary: "High token usage on {{ $labels.stack_name }}" + description: "Stack {{ $labels.stack_name }} has used over 100k tokens in the last hour." 
+ + - alert: LogIngestDown + expr: up{job="log-ingest"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Log ingest service is down" + description: "The central log ingest service has been down for more than 2 minutes." + + - alert: LokiDown + expr: up{job="loki"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Loki is down" + description: "Loki log aggregation service has been down for more than 2 minutes." diff --git a/logging-stack/config/grafana/provisioning/dashboards/dashboards.yml b/logging-stack/config/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 0000000..d551e20 --- /dev/null +++ b/logging-stack/config/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: 'AI Stack Dashboards' + orgId: 1 + folder: 'AI Stacks' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards diff --git a/logging-stack/config/grafana/provisioning/datasources/datasources.yml b/logging-stack/config/grafana/provisioning/datasources/datasources.yml new file mode 100644 index 0000000..ab4c7af --- /dev/null +++ b/logging-stack/config/grafana/provisioning/datasources/datasources.yml @@ -0,0 +1,17 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false + + - name: Loki + type: loki + access: proxy + url: http://loki:3100 + editable: false + jsonData: + maxLines: 1000 diff --git a/logging-stack/config/loki-config.yml b/logging-stack/config/loki-config.yml new file mode 100644 index 0000000..d9d7345 --- /dev/null +++ b/logging-stack/config/loki-config.yml @@ -0,0 +1,51 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + 
ring: + kvstore: + store: inmemory + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + +schema_config: + configs: + - from: 2020-10-24 + store: boltdb-shipper + object_store: filesystem + schema: v11 + index: + prefix: index_ + period: 24h + +ruler: + alertmanager_url: http://localhost:9093 + +limits_config: + retention_period: 168h + ingestion_rate_mb: 10 + ingestion_burst_size_mb: 20 + max_streams_per_user: 10000 + max_line_size: 256kb + +compactor: + working_directory: /loki/compactor + shared_store: filesystem + retention_enabled: true + retention_delete_delay: 2h + retention_delete_worker_count: 150 diff --git a/logging-stack/config/prometheus.yml b/logging-stack/config/prometheus.yml new file mode 100644 index 0000000..c3badc7 --- /dev/null +++ b/logging-stack/config/prometheus.yml @@ -0,0 +1,62 @@ +global: + scrape_interval: 15s + evaluation_interval: 15s + external_labels: + monitor: 'ai-stack-monitor' + +alerting: + alertmanagers: + - static_configs: + - targets: [] + +rule_files: + - /etc/prometheus/alerting/*.yml + +scrape_configs: + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'loki' + static_configs: + - targets: ['loki:3100'] + + - job_name: 'log-ingest' + static_configs: + - targets: ['log-ingest:3000'] + + - job_name: 'ai-stacks' + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 30s + relabel_configs: + - source_labels: [__meta_docker_container_name] + regex: '/(ai-stack-.*|app-.*opencode.*)' + action: keep + - source_labels: [__meta_docker_container_name] + regex: '/?(.*)' + target_label: container + - source_labels: [__meta_docker_port_private] + regex: '9090' + action: keep + - source_labels: [__meta_docker_container_label_com_docker_swarm_service_name] + target_label: service + - source_labels: [__meta_docker_container_label_stack_name] + target_label: stack_name + - source_labels: [__meta_docker_container_name] + regex: 
'.*opencode-([a-z0-9-]+).*' + replacement: '${1}' + target_label: stack_name + - source_labels: [__meta_docker_container_name] + regex: '.*ai-stack-([a-z0-9-]+).*' + replacement: '${1}' + target_label: stack_name + - target_label: __address__ + replacement: '${1}:9090' + source_labels: [__meta_docker_container_network_ip] + + - job_name: 'ai-stacks-static' + file_sd_configs: + - files: + - /etc/prometheus/targets/*.json + refresh_interval: 30s diff --git a/logging-stack/config/promtail-config.yml b/logging-stack/config/promtail-config.yml new file mode 100644 index 0000000..cd98d2e --- /dev/null +++ b/logging-stack/config/promtail-config.yml @@ -0,0 +1,71 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + relabel_configs: + - source_labels: ['__meta_docker_container_name'] + regex: '/(.*)' + target_label: 'container' + - source_labels: ['__meta_docker_container_label_com_docker_swarm_service_name'] + target_label: 'service' + - source_labels: ['__meta_docker_container_label_com_docker_compose_project'] + target_label: 'project' + - source_labels: ['__meta_docker_container_name'] + regex: '/?(ai-stack-.*|app-.*opencode.*)' + action: keep + - source_labels: ['__meta_docker_container_label_stack_name'] + target_label: 'stack_name' + - source_labels: ['__meta_docker_container_name'] + regex: '.*opencode-([a-z0-9-]+).*' + target_label: 'stack_name' + - source_labels: ['__meta_docker_container_name'] + regex: '.*ai-stack-([a-z0-9-]+).*' + target_label: 'stack_name' + pipeline_stages: + - json: + expressions: + output: log + stream: stream + timestamp: time + - labels: + stream: + - timestamp: + source: timestamp + format: RFC3339Nano + - output: + source: output + + - job_name: ai-stack-events + static_configs: + - targets: + - localhost + 
labels: + job: ai-stack-events + __path__: /var/log/ai-stack/*.jsonl + pipeline_stages: + - json: + expressions: + stack_name: stack_name + session_id: session_id + event_type: event_type + model: data.model + agent: data.agent + tool: data.tool + - labels: + stack_name: + session_id: + event_type: + model: + agent: + tool: diff --git a/logging-stack/dashboards/ai-stack-overview.json b/logging-stack/dashboards/ai-stack-overview.json new file mode 100644 index 0000000..90768ca --- /dev/null +++ b/logging-stack/dashboards/ai-stack-overview.json @@ -0,0 +1,508 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "expr": "count(opencode_active_sessions{stack_name=~\"$stack_name\"})", + "legendFormat": "Active Sessions", + "refId": "A" + } + ], + "title": "Active Sessions", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 }, + "id": 2, + "options": { + "colorMode": "value", + 
"graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "expr": "sum(increase(opencode_messages_total{stack_name=~\"$stack_name\"}[$__range]))", + "legendFormat": "Total Messages", + "refId": "A" + } + ], + "title": "Messages (Period)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "expr": "sum(increase(opencode_tokens_total{stack_name=~\"$stack_name\"}[$__range]))", + "legendFormat": "Total Tokens", + "refId": "A" + } + ], + "title": "Tokens Used (Period)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 0.01 }, + { "color": "red", "value": 0.05 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": 
"10.2.0", + "targets": [ + { + "expr": "sum(rate(opencode_errors_total{stack_name=~\"$stack_name\"}[5m])) / sum(rate(opencode_messages_total{stack_name=~\"$stack_name\"}[5m]))", + "legendFormat": "Error Rate", + "refId": "A" + } + ], + "title": "Error Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "id": 5, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "expr": "sum by (stack_name) (rate(opencode_messages_total{stack_name=~\"$stack_name\"}[5m]))", + "legendFormat": "{{stack_name}}", + "refId": "A" + } + ], + "title": "Messages per Second by Stack", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 
10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "id": 6, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "expr": "sum by (model) (rate(opencode_tokens_total{stack_name=~\"$stack_name\"}[5m]))", + "legendFormat": "{{model}}", + "refId": "A" + } + ], + "title": "Token Usage by Model", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 }, + "id": 7, + "options": { + "legend": { "displayMode": "list", "placement": "right", "showLegend": true }, + "pieType": "pie", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "expr": "sum by (tool) (increase(opencode_tool_invocations_total{stack_name=~\"$stack_name\"}[$__range]))", + "legendFormat": "{{tool}}", + "refId": "A" + } + ], + "title": "Tool Usage Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { "mode": 
"palette-classic" }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 }, + "id": 8, + "options": { + "legend": { "displayMode": "list", "placement": "right", "showLegend": true }, + "pieType": "pie", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { + "expr": "sum by (agent) (increase(opencode_messages_total{stack_name=~\"$stack_name\"}[$__range]))", + "legendFormat": "{{agent}}", + "refId": "A" + } + ], + "title": "Agent Usage Distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { + "align": "auto", + "cellOptions": { "type": "auto" }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 }, + "id": 9, + "options": { + "cellHeight": "sm", + "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, + "showHeader": true + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "expr": "topk(10, sum by (stack_name) (increase(opencode_messages_total[$__range])))", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Top 10 Active Stacks", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { "Time": true }, + "indexByName": {}, + "renameByName": { "Value": "Messages", "stack_name": "Stack Name" } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 20 }, + "id": 10, + "options": { + "dedupStrategy": "none", + "enableLogDetails": 
true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "targets": [ + { + "expr": "{stack_name=~\"$stack_name\"} |= ``", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Live Logs", + "type": "logs" + } + ], + "refresh": "10s", + "schemaVersion": 38, + "tags": ["ai-stack", "monitoring"], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(opencode_messages_total, stack_name)", + "hide": 0, + "includeAll": true, + "label": "Stack Name", + "multi": true, + "name": "stack_name", + "options": [], + "query": { + "query": "label_values(opencode_messages_total, stack_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "AI Stack Overview", + "uid": "ai-stack-overview", + "version": 1, + "weekStart": "" +} diff --git a/logging-stack/docker-compose.yml b/logging-stack/docker-compose.yml new file mode 100644 index 0000000..7ef6eec --- /dev/null +++ b/logging-stack/docker-compose.yml @@ -0,0 +1,138 @@ +version: "3.8" + +# AI Stack Logging Infrastructure +# Loki (logs) + Prometheus (metrics) + Grafana (visualization) + +services: + # ============================================================================= + # LOKI - Log Aggregation + # ============================================================================= + loki: + image: grafana/loki:2.9.0 + container_name: ai-stack-loki + ports: + - "3100:3100" + volumes: + - ./config/loki-config.yml:/etc/loki/local-config.yaml:ro + - loki-data:/loki + command: -config.file=/etc/loki/local-config.yaml + networks: + - logging-network + 
restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + + # ============================================================================= + # PROMTAIL - Log Collector (ships logs to Loki) + # ============================================================================= + promtail: + image: grafana/promtail:2.9.0 + container_name: ai-stack-promtail + volumes: + - ./config/promtail-config.yml:/etc/promtail/config.yml:ro + - /var/log:/var/log:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + command: -config.file=/etc/promtail/config.yml + networks: + - logging-network + depends_on: + - loki + restart: unless-stopped + + # ============================================================================= + # PROMETHEUS - Metrics Collection + # ============================================================================= + prometheus: + image: prom/prometheus:v2.47.0 + container_name: ai-stack-prometheus + ports: + - "9090:9090" + volumes: + - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./alerting:/etc/prometheus/alerting:ro + - prometheus-data:/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--storage.tsdb.retention.time=90d' + - '--web.enable-lifecycle' + - '--web.enable-admin-api' + networks: + - logging-network + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"] + interval: 30s + timeout: 10s + retries: 3 + + # ============================================================================= + # GRAFANA - Visualization & Dashboards + # ============================================================================= + grafana: + image: grafana/grafana:10.2.0 + container_name: ai-stack-grafana + ports: + - 
"3001:3000" + environment: + - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin} + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin} + - GF_USERS_ALLOW_SIGN_UP=false + - GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3001} + - GF_INSTALL_PLUGINS=grafana-piechart-panel + volumes: + - ./config/grafana/provisioning:/etc/grafana/provisioning:ro + - ./dashboards:/var/lib/grafana/dashboards:ro + - grafana-data:/var/lib/grafana + networks: + - logging-network + depends_on: + - loki + - prometheus + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + + # ============================================================================= + # LOG INGEST API - Custom endpoint for AI stack events + # ============================================================================= + log-ingest: + build: + context: ./log-ingest + dockerfile: Dockerfile + container_name: ai-stack-log-ingest + ports: + - "3102:3000" + environment: + - LOKI_URL=http://loki:3100 + - LOG_LEVEL=info + networks: + - logging-network + depends_on: + - loki + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/health"] + interval: 30s + timeout: 10s + retries: 3 + +networks: + logging-network: + driver: bridge + name: ai-stack-logging + +volumes: + loki-data: + name: ai-stack-loki-data + prometheus-data: + name: ai-stack-prometheus-data + grafana-data: + name: ai-stack-grafana-data diff --git a/logging-stack/log-ingest/Dockerfile b/logging-stack/log-ingest/Dockerfile new file mode 100644 index 0000000..240e3fb --- /dev/null +++ b/logging-stack/log-ingest/Dockerfile @@ -0,0 +1,13 @@ +FROM oven/bun:1.0 + +WORKDIR /app + +COPY package.json bun.lockb* ./ +RUN bun install --frozen-lockfile 2>/dev/null || bun install + +COPY . . 
// Prometheus metrics for the ingest service itself (not the AI stacks).
// All metrics are registered on the local `register` so /metrics can
// expose them alongside the default process metrics.
const metrics = {
  // Count of every event accepted by /ingest or /ingest/batch,
  // partitioned by originating stack and event type.
  eventsReceived: new Counter({
    name: 'log_ingest_events_total',
    help: 'Total events received',
    labelNames: ['stack_name', 'event_type'],
    registers: [register]
  }),

  // End-to-end handler latency in seconds (JSON parse through Loki push).
  // Buckets are tuned for a fast local service: 1ms .. 1s.
  eventProcessingDuration: new Histogram({
    name: 'log_ingest_processing_duration_seconds',
    help: 'Event processing duration',
    labelNames: ['stack_name'],
    buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
    registers: [register]
  }),

  // Incremented whenever a handler fails, including Loki push failures.
  lokiPushErrors: new Counter({
    name: 'log_ingest_loki_errors_total',
    help: 'Loki push errors',
    registers: [register]
  }),

  // Gauge refreshed on each /metrics scrape from the in-memory set of
  // distinct stack_name values seen since process start (resets on restart).
  activeStacks: new Gauge({
    name: 'log_ingest_active_stacks',
    help: 'Number of active stacks sending events',
    registers: [register]
  })
};
'session_end' | 'message' | 'tool_use' | 'error' | 'mcp_connect' | 'mcp_disconnect'; + data?: { + role?: 'user' | 'assistant' | 'system'; + model?: string; + agent?: string; + tool?: string; + tokens_in?: number; + tokens_out?: number; + duration_ms?: number; + success?: boolean; + error_code?: string; + error_message?: string; + content_length?: number; + content_hash?: string; + mcp_server?: string; + }; +} + +const activeStacksSet = new Set(); + +async function pushToLoki(events: LogEvent[]): Promise { + const streams: Record; values: [string, string][] }> = {}; + + for (const event of events) { + const labels = { + job: 'ai-stack-events', + stack_name: event.stack_name, + event_type: event.event_type, + ...(event.session_id && { session_id: event.session_id }), + ...(event.data?.model && { model: event.data.model }), + ...(event.data?.agent && { agent: event.data.agent }), + ...(event.data?.tool && { tool: event.data.tool }) + }; + + const labelKey = JSON.stringify(labels); + + if (!streams[labelKey]) { + streams[labelKey] = { + stream: labels, + values: [] + }; + } + + const timestamp = event.timestamp || new Date().toISOString(); + const nanoseconds = BigInt(new Date(timestamp).getTime()) * BigInt(1_000_000); + + streams[labelKey].values.push([ + nanoseconds.toString(), + JSON.stringify(event) + ]); + } + + const payload = { + streams: Object.values(streams) + }; + + const response = await fetch(`${LOKI_URL}/loki/api/v1/push`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json' + }, + body: JSON.stringify(payload) + }); + + if (!response.ok) { + const text = await response.text(); + throw new Error(`Loki push failed: ${response.status} ${text}`); + } +} + +app.use('*', cors()); +app.use('*', logger()); + +app.get('/health', (c) => { + return c.json({ status: 'healthy', timestamp: new Date().toISOString() }); +}); + +app.get('/metrics', async (c) => { + metrics.activeStacks.set(activeStacksSet.size); + c.header('Content-Type', 
register.contentType); + return c.text(await register.metrics()); +}); + +app.post('/ingest', async (c) => { + const startTime = Date.now(); + + try { + const body = await c.req.json(); + const events: LogEvent[] = Array.isArray(body) ? body : [body]; + + for (const event of events) { + if (!event.stack_name || !event.event_type) { + return c.json({ error: 'Missing required fields: stack_name, event_type' }, 400); + } + + activeStacksSet.add(event.stack_name); + metrics.eventsReceived.inc({ stack_name: event.stack_name, event_type: event.event_type }); + } + + await pushToLoki(events); + + const duration = (Date.now() - startTime) / 1000; + for (const event of events) { + metrics.eventProcessingDuration.observe({ stack_name: event.stack_name }, duration); + } + + return c.json({ success: true, count: events.length }); + } catch (error) { + metrics.lokiPushErrors.inc(); + console.error('Ingest error:', error); + return c.json({ error: 'Failed to process events', details: String(error) }, 500); + } +}); + +app.post('/ingest/batch', async (c) => { + const startTime = Date.now(); + + try { + const body = await c.req.json(); + + if (!Array.isArray(body)) { + return c.json({ error: 'Expected array of events' }, 400); + } + + const events: LogEvent[] = body; + + for (const event of events) { + if (!event.stack_name || !event.event_type) { + continue; + } + activeStacksSet.add(event.stack_name); + metrics.eventsReceived.inc({ stack_name: event.stack_name, event_type: event.event_type }); + } + + await pushToLoki(events); + + const duration = (Date.now() - startTime) / 1000; + metrics.eventProcessingDuration.observe({ stack_name: 'batch' }, duration); + + return c.json({ success: true, count: events.length }); + } catch (error) { + metrics.lokiPushErrors.inc(); + console.error('Batch ingest error:', error); + return c.json({ error: 'Failed to process batch', details: String(error) }, 500); + } +}); + +const port = parseInt(process.env.PORT || '3000'); +console.log(`Log 
ingest service starting on port ${port}`); + +export default { + port, + fetch: app.fetch +}; diff --git a/logging-stack/log-ingest/tsconfig.json b/logging-stack/log-ingest/tsconfig.json new file mode 100644 index 0000000..2eba838 --- /dev/null +++ b/logging-stack/log-ingest/tsconfig.json @@ -0,0 +1,12 @@ +{ + "compilerOptions": { + "target": "ESNext", + "module": "ESNext", + "moduleResolution": "bundler", + "types": ["bun-types"], + "esModuleInterop": true, + "strict": true, + "skipLibCheck": true, + "noEmit": true + } +} diff --git a/src/api/dokploy-production.ts b/src/api/dokploy-production.ts index 0dd0ada..5587e93 100644 --- a/src/api/dokploy-production.ts +++ b/src/api/dokploy-production.ts @@ -388,6 +388,16 @@ export class DokployProductionClient { ); } + async setApplicationEnv(applicationId: string, env: string): Promise { + await this.request( + 'POST', + '/application.update', + { applicationId, env }, + 'application', + 'set-env' + ); + } + async getApplication(applicationId: string): Promise { return this.request( 'GET', diff --git a/src/api/dokploy.ts b/src/api/dokploy.ts index c7b50f8..29e8a2d 100644 --- a/src/api/dokploy.ts +++ b/src/api/dokploy.ts @@ -149,6 +149,20 @@ export class DokployClient { } satisfies CreateDomainRequest); } + async setApplicationEnv(applicationId: string, env: string): Promise { + await this.request('POST', '/application.update', { + applicationId, + env + }); + } + + async addApplicationLabel(applicationId: string, key: string, value: string): Promise { + await this.request('POST', '/application.update', { + applicationId, + dockerLabels: `${key}=${value}` + }); + } + async deployApplication(applicationId: string): Promise { await this.request('POST', '/application.deploy', { applicationId }); } diff --git a/src/orchestrator/production-deployer.ts b/src/orchestrator/production-deployer.ts index 45baa9b..575e874 100644 --- a/src/orchestrator/production-deployer.ts +++ b/src/orchestrator/production-deployer.ts @@ -280,6 
+280,18 @@ export class ProductionDeployer { registryId: config.registryId, }); + state.progress = 52; + state.message = 'Setting environment variables for logging'; + + const envVars = [ + `STACK_NAME=${config.stackName}`, + `USAGE_LOGGING_ENABLED=true`, + `LOG_INGEST_URL=${process.env.LOG_INGEST_URL || 'http://10.100.0.20:3102/ingest'}`, + `METRICS_PORT=9090`, + ].join('\n'); + + await this.client.setApplicationEnv(state.resources.applicationId, envVars); + state.progress = 55; state.message = 'Creating persistent storage';