feat: add comprehensive logging infrastructure
- Add Loki/Prometheus/Grafana stack in logging-stack/ - Add log-ingest service for receiving events from AI stacks - Add Grafana dashboard with stack_name filtering - Update Dokploy client with setApplicationEnv method - Configure STACK_NAME env var for deployed stacks - Add alerting rules for stack health monitoring
This commit is contained in:
435
docs/LOGGING-PLAN.md
Normal file
435
docs/LOGGING-PLAN.md
Normal file
@@ -0,0 +1,435 @@
|
||||
# AI Stack Monitoring & Logging Plan
|
||||
|
||||
## Overview
|
||||
|
||||
Comprehensive logging strategy for all deployed AI stacks at `*.ai.flexinit.nl` to enable:
|
||||
- Usage analytics and billing
|
||||
- Debugging and support
|
||||
- Security auditing
|
||||
- Performance optimization
|
||||
- User behavior insights
|
||||
|
||||
---
|
||||
|
||||
## 1. Log Categories
|
||||
|
||||
### 1.1 System Logs
|
||||
| Log Type | Source | Content |
|
||||
|----------|--------|---------|
|
||||
| Container stdout/stderr | Docker | OpenCode server output, errors, startup |
|
||||
| Health checks | Docker | Container health status over time |
|
||||
| Resource metrics | cAdvisor/Prometheus | CPU, memory, network, disk I/O |
|
||||
|
||||
### 1.2 OpenCode Server Logs
|
||||
| Log Type | Source | Content |
|
||||
|----------|--------|---------|
|
||||
| Server events | `--print-logs` | HTTP requests, WebSocket connections |
|
||||
| Session lifecycle | OpenCode | Session start/end, duration |
|
||||
| Tool invocations | OpenCode | Which tools used, success/failure |
|
||||
| MCP connections | OpenCode | MCP server connects/disconnects |
|
||||
|
||||
### 1.3 AI Interaction Logs
|
||||
| Log Type | Source | Content |
|
||||
|----------|--------|---------|
|
||||
| Prompts | OpenCode session | User messages (anonymized) |
|
||||
| Responses | OpenCode session | AI responses (summarized) |
|
||||
| Token usage | Provider API | Input/output tokens per request |
|
||||
| Model selection | OpenCode | Which model used per request |
|
||||
| Agent selection | oh-my-opencode | Which agent (Sisyphus, Oracle, etc.) |
|
||||
|
||||
### 1.4 User Activity Logs
|
||||
| Log Type | Source | Content |
|
||||
|----------|--------|---------|
|
||||
| File operations | OpenCode tools | Read/write/edit actions |
|
||||
| Bash commands | OpenCode tools | Commands executed |
|
||||
| Git operations | OpenCode tools | Commits, pushes, branches |
|
||||
| Web fetches | OpenCode tools | URLs accessed |
|
||||
|
||||
---
|
||||
|
||||
## 2. Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ AI Stack Container │
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐ │
|
||||
│ │ OpenCode │ │ Fluent Bit │ │ OpenTelemetry SDK │ │
|
||||
│ │ Server │──│ (sidecar) │──│ (instrumentation) │ │
|
||||
│ └──────────────┘ └──────┬───────┘ └──────────┬───────────┘ │
|
||||
└────────────────────────────┼────────────────────┼───────────────┘
|
||||
│ │
|
||||
▼ ▼
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Central Logging Stack │
|
||||
│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────┐ │
|
||||
│ │ Loki │ │ Prometheus │ │ Tempo │ │
|
||||
│ │ (logs) │ │ (metrics) │ │ (traces) │ │
|
||||
│ └──────┬───────┘ └──────┬───────┘ └──────────┬───────────┘ │
|
||||
│ └─────────────────┼─────────────────────┘ │
|
||||
│ ▼ │
|
||||
│ ┌──────────────┐ │
|
||||
│ │ Grafana │ │
|
||||
│ │ (dashboard) │ │
|
||||
│ └──────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. Implementation Plan
|
||||
|
||||
### Phase 1: Container Logging (Week 1)
|
||||
|
||||
#### 3.1.1 Docker Log Driver
|
||||
```yaml
|
||||
# docker-compose addition for each stack
|
||||
logging:
|
||||
driver: "fluentd"
|
||||
options:
|
||||
fluentd-address: "10.100.0.x:24224"
|
||||
tag: "ai-stack.{{.Name}}"
|
||||
fluentd-async: "true"
|
||||
```
|
||||
|
||||
#### 3.1.2 OpenCode Server Logs
|
||||
Modify Dockerfile CMD to capture structured logs:
|
||||
```dockerfile
|
||||
CMD ["sh", "-c", "opencode serve --hostname 0.0.0.0 --port 8080 --mdns --print-logs --log-level INFO 2>&1 | tee /var/log/opencode/server.log"]
|
||||
```
|
||||
|
||||
#### 3.1.3 Log Rotation
|
||||
```dockerfile
|
||||
# Install logrotate and add the OpenCode rotation policy.
# The package index is refreshed in the same layer as the install so the step
# does not depend on a cached (or absent) index, and the index is removed
# afterwards to keep the layer small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends logrotate \
    && rm -rf /var/lib/apt/lists/*
COPY logrotate.conf /etc/logrotate.d/opencode
|
||||
```
|
||||
|
||||
### Phase 2: Session & Prompt Logging (Week 2)
|
||||
|
||||
#### 3.2.1 OpenCode Plugin for Logging
|
||||
Create logging hook in oh-my-opencode:
|
||||
|
||||
```typescript
|
||||
// src/hooks/logging.ts
|
||||
/**
 * Hook that reports session, message and tool activity through `logEvent`.
 *
 * Message text and tool arguments are never forwarded as-is: only a digest
 * (via `hash`) and, for messages, the text length are recorded.
 *
 * NOTE(review): the keys emitted here are camelCase (`type`, `stackName`,
 * `sessionId`), while the event schema in section 4.1 of this plan uses
 * snake_case with a nested `data` object — confirm which shape the
 * receiving side expects.
 */
export const loggingHook: Hook = {
  name: 'session-logger',

  // Emits a single `session_start` event when a session begins.
  async onSessionStart(session) {
    await logEvent({
      type: 'session_start',
      stackName: process.env.STACK_NAME,
      sessionId: session.id,
      timestamp: new Date().toISOString(),
    });
  },

  // Emits one `message` event per message, tagged with the session's model and agent.
  async onMessage(message, session) {
    const text = message.content;

    await logEvent({
      type: 'message',
      stackName: process.env.STACK_NAME,
      sessionId: session.id,
      role: message.role,
      // Privacy: record a digest and the size of the text, not the text itself.
      contentHash: hash(text),
      contentLength: text.length,
      model: session.model,
      agent: session.agent,
      timestamp: new Date().toISOString(),
    });
  },

  // Emits one `tool_use` event per tool call with its outcome and duration.
  async onToolUse(tool, args, result, session) {
    await logEvent({
      type: 'tool_use',
      stackName: process.env.STACK_NAME,
      sessionId: session.id,
      tool: tool.name,
      // Arguments may carry secrets, so only their digest is logged.
      argsHash: hash(JSON.stringify(args)),
      // A call counts as successful when the result carries no error.
      success: !result.error,
      duration: result.duration,
      timestamp: new Date().toISOString(),
    });
  },
};
|
||||
```
|
||||
|
||||
#### 3.2.2 Log Destination Options
|
||||
|
||||
**Option A: Centralized HTTP Endpoint**
|
||||
```typescript
|
||||
/**
 * Ships one event to the central ingest endpoint on a best-effort basis.
 *
 * Telemetry must not break the session it observes, so transport failures
 * and non-2xx responses are reported on stderr instead of being thrown.
 *
 * @param event - Event payload; sent as the JSON request body.
 * @returns A promise that resolves once the attempt has finished, whether
 *          or not the event was accepted.
 */
async function logEvent(event: LogEvent) {
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  };

  // Only send identification headers that are actually configured; an unset
  // variable would otherwise go out as the literal string "undefined".
  const stackName = process.env.STACK_NAME;
  if (stackName !== undefined) {
    headers['X-Stack-Name'] = stackName;
  }
  const apiKey = process.env.LOGGING_API_KEY;
  if (apiKey !== undefined) {
    headers['X-API-Key'] = apiKey;
  }

  try {
    const response = await fetch('https://logs.ai.flexinit.nl/ingest', {
      method: 'POST',
      headers,
      body: JSON.stringify(event),
    });

    // fetch() resolves even on HTTP errors, so a rejected event has to be
    // detected from the response status.
    if (!response.ok) {
      console.error(`logEvent: ingest rejected event (HTTP ${response.status})`);
    }
  } catch (err: unknown) {
    console.error('logEvent: failed to ship event', err);
  }
}
|
||||
```
|
||||
|
||||
**Option B: Local File + Fluent Bit**
|
||||
```typescript
|
||||
/**
 * Appends the event to the local JSON Lines event log, one serialized
 * event per line.
 *
 * @param event - Event payload to serialize and append.
 */
async function logEvent(event: LogEvent) {
  await fs.appendFile(
    '/var/log/opencode/events.jsonl',
    `${JSON.stringify(event)}\n`,
  );
}
|
||||
```
|
||||
|
||||
### Phase 3: Metrics Collection (Week 3)
|
||||
|
||||
#### 3.3.1 Prometheus Metrics Endpoint
|
||||
Add to OpenCode container:
|
||||
|
||||
```typescript
|
||||
// metrics.ts
|
||||
import { register, Counter, Histogram, Gauge } from 'prom-client';
|
||||
|
||||
// Each collector is created once, in the order listed, and then grouped
// under `metrics`. Every collector carries a `stack_name` label.

// Sessions seen so far.
const sessionsTotal = new Counter({
  name: 'opencode_sessions_total',
  help: 'Total number of sessions',
  labelNames: ['stack_name'],
});

// Messages processed, split by role, model and agent.
const messagesTotal = new Counter({
  name: 'opencode_messages_total',
  help: 'Total messages processed',
  labelNames: ['stack_name', 'role', 'model', 'agent'],
});

// Tokens consumed, split by model and direction.
const tokensUsed = new Counter({
  name: 'opencode_tokens_total',
  help: 'Total tokens used',
  labelNames: ['stack_name', 'model', 'direction'],
});

// Tool calls, split by tool and outcome.
const toolInvocations = new Counter({
  name: 'opencode_tool_invocations_total',
  help: 'Tool invocations',
  labelNames: ['stack_name', 'tool', 'success'],
});

// Response time distribution; bucket bounds are in seconds (0.5 s to 120 s).
const responseDuration = new Histogram({
  name: 'opencode_response_duration_seconds',
  help: 'AI response duration',
  labelNames: ['stack_name', 'model'],
  buckets: [0.5, 1, 2, 5, 10, 30, 60, 120],
});

// Sessions open right now.
const activeSessions = new Gauge({
  name: 'opencode_active_sessions',
  help: 'Currently active sessions',
  labelNames: ['stack_name'],
});

/** Prometheus collectors exposed by the OpenCode container. */
export const metrics = {
  sessionsTotal,
  messagesTotal,
  tokensUsed,
  toolInvocations,
  responseDuration,
  activeSessions,
};
|
||||
```
|
||||
|
||||
#### 3.3.2 Expose Metrics Endpoint
|
||||
```typescript
|
||||
// Add to container
|
||||
// Prometheus scrape endpoint: responds with the registry's metrics using the
// registry's own content type.
// NOTE(review): assumes `app` is an Express-style application — confirm. A
// rejected promise from an async handler is handled here so the scrape always
// gets a response instead of being left pending.
app.get('/metrics', async (_req, res) => {
  try {
    res.set('Content-Type', register.contentType);
    res.send(await register.metrics());
  } catch (err: unknown) {
    console.error('Failed to collect Prometheus metrics:', err);
    res.status(500).end();
  }
});
|
||||
```
|
||||
|
||||
### Phase 4: Central Logging Infrastructure (Week 4)
|
||||
|
||||
#### 3.4.1 Deploy Logging Stack
|
||||
```yaml
|
||||
# docker-compose.logging.yml
|
||||
services:
|
||||
loki:
|
||||
image: grafana/loki:latest
|
||||
ports:
|
||||
- "3100:3100"
|
||||
volumes:
|
||||
- loki-data:/loki
|
||||
|
||||
promtail:
|
||||
image: grafana/promtail:latest
|
||||
volumes:
|
||||
- /var/log:/var/log:ro
|
||||
- ./promtail-config.yml:/etc/promtail/config.yml
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
ports:
|
||||
- "9090:9090"
|
||||
volumes:
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
- prometheus-data:/prometheus
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
ports:
|
||||
- "3000:3000"
|
||||
environment:
|
||||
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
|
||||
volumes:
|
||||
- grafana-data:/var/lib/grafana
|
||||
```
|
||||
|
||||
#### 3.4.2 Prometheus Scrape Config
|
||||
```yaml
|
||||
# prometheus.yml
|
||||
scrape_configs:
|
||||
- job_name: 'ai-stacks'
|
||||
dns_sd_configs:
|
||||
- names:
|
||||
- 'tasks.ai-stack-*'
|
||||
type: 'A'
|
||||
port: 9090
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_dns_name]
|
||||
target_label: stack_name
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. Data Schema
|
||||
|
||||
### 4.1 Event Log Schema (JSON Lines)
|
||||
```json
|
||||
{
|
||||
"timestamp": "2026-01-10T12:00:00.000Z",
|
||||
"stack_name": "john-dev",
|
||||
"session_id": "sess_abc123",
|
||||
"event_type": "message|tool_use|session_start|session_end|error",
|
||||
"data": {
|
||||
"role": "user|assistant",
|
||||
"model": "glm-4.7-free",
|
||||
"agent": "sisyphus",
|
||||
"tool": "bash",
|
||||
"tokens_in": 1500,
|
||||
"tokens_out": 500,
|
||||
"duration_ms": 2340,
|
||||
"success": true,
|
||||
"error_code": null
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 4.2 Metrics Labels
|
||||
| Metric | Labels |
|
||||
|--------|--------|
|
||||
| `opencode_*` | `stack_name`, `role`, `model`, `agent`, `direction`, `tool`, `success` |
|
||||
|
||||
---
|
||||
|
||||
## 5. Privacy & Security
|
||||
|
||||
### 5.1 Data Anonymization
|
||||
- **Prompts**: Hash content, store only length and word count
|
||||
- **File paths**: Anonymize to pattern (e.g., `/home/user/project/src/*.ts`)
|
||||
- **Bash commands**: Log command name only, not arguments with secrets
|
||||
- **Env vars**: Never log, redact from all outputs
|
||||
|
||||
### 5.2 Retention Policy
|
||||
| Data Type | Retention | Storage |
|
||||
|-----------|-----------|---------|
|
||||
| Raw logs | 7 days | Loki |
|
||||
| Aggregated metrics | 90 days | Prometheus |
|
||||
| Session summaries | 1 year | PostgreSQL |
|
||||
| Billing data | 7 years | PostgreSQL |
|
||||
|
||||
### 5.3 Access Control
|
||||
- Logs accessible only to platform admins
|
||||
- Users can request their own data export
|
||||
- Stack owners can view their stack's metrics in Grafana
|
||||
|
||||
---
|
||||
|
||||
## 6. Grafana Dashboards
|
||||
|
||||
### 6.1 Platform Overview
|
||||
- Total active stacks
|
||||
- Messages per hour (all stacks)
|
||||
- Token usage by model
|
||||
- Error rate
|
||||
- Top agents used
|
||||
|
||||
### 6.2 Per-Stack Dashboard
|
||||
- Session count over time
|
||||
- Token usage
|
||||
- Tool usage breakdown
|
||||
- Response time percentiles
|
||||
- Error log viewer
|
||||
|
||||
### 6.3 Alerts
|
||||
```yaml
|
||||
# alerting-rules.yml
|
||||
groups:
|
||||
- name: ai-stack-alerts
|
||||
rules:
|
||||
- alert: StackUnhealthy
|
||||
expr: up{job="ai-stacks"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Stack {{ $labels.stack_name }} is down"
|
||||
|
||||
- alert: HighErrorRate
|
||||
expr: sum by (stack_name) (rate(opencode_errors_total[5m])) / sum by (stack_name) (rate(opencode_messages_total[5m])) > 0.1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High error rate on {{ $labels.stack_name }}"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. Implementation Checklist
|
||||
|
||||
### Phase 1: Container Logging
|
||||
- [ ] Set up Loki + Promtail on logging server
|
||||
- [ ] Configure Docker log driver for ai-stack containers
|
||||
- [ ] Add log rotation to Dockerfile
|
||||
- [ ] Verify logs flowing to Loki
|
||||
|
||||
### Phase 2: Session Logging
|
||||
- [ ] Create logging hook in oh-my-opencode
|
||||
- [ ] Define event schema
|
||||
- [ ] Implement log shipping (HTTP or file-based)
|
||||
- [ ] Add session/message/tool logging
|
||||
|
||||
### Phase 3: Metrics
|
||||
- [ ] Add prom-client to container
|
||||
- [ ] Expose /metrics endpoint
|
||||
- [ ] Configure Prometheus scraping
|
||||
- [ ] Create initial Grafana dashboards
|
||||
|
||||
### Phase 4: Production Hardening
|
||||
- [ ] Implement data anonymization
|
||||
- [ ] Set up retention policies
|
||||
- [ ] Configure alerts
|
||||
- [ ] Document runbooks
|
||||
|
||||
---
|
||||
|
||||
## 8. Cost Estimates
|
||||
|
||||
| Component | Resource | Monthly Cost |
|
||||
|-----------|----------|--------------|
|
||||
| Loki | 50GB logs @ 7 days | ~$15 |
|
||||
| Prometheus | 10GB metrics @ 90 days | ~$10 |
|
||||
| Grafana | 1 instance | Free (OSS) |
|
||||
| Log ingestion | Network | ~$5 |
|
||||
| **Total** | | **~$30/month** |
|
||||
|
||||
---
|
||||
|
||||
## 9. Next Steps
|
||||
|
||||
1. **Approve plan** - Review and confirm approach
|
||||
2. **Deploy logging infra** - Loki/Prometheus/Grafana on dedicated server
|
||||
3. **Modify Dockerfile** - Add logging configuration
|
||||
4. **Create oh-my-opencode hooks** - Session/message/tool logging
|
||||
5. **Build dashboards** - Grafana visualizations
|
||||
6. **Test with pilot stack** - Validate before rollout
|
||||
7. **Rollout to all stacks** - Update deployer to include logging config
|
||||
62
logging-stack/alerting/ai-stack-alerts.yml
Normal file
62
logging-stack/alerting/ai-stack-alerts.yml
Normal file
@@ -0,0 +1,62 @@
|
||||
groups:
|
||||
- name: ai-stack-alerts
|
||||
rules:
|
||||
- alert: StackUnhealthy
|
||||
expr: up{job="ai-stacks"} == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Stack {{ $labels.stack_name }} is down"
|
||||
description: "AI Stack {{ $labels.stack_name }} has been unhealthy for more than 5 minutes."
|
||||
|
||||
- alert: HighErrorRate
|
||||
expr: |
|
||||
sum by (stack_name) (rate(opencode_errors_total[5m]))
|
||||
/
|
||||
sum by (stack_name) (rate(opencode_messages_total[5m]))
|
||||
> 0.1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High error rate on {{ $labels.stack_name }}"
|
||||
description: "Stack {{ $labels.stack_name }} has error rate above 10% for 10 minutes."
|
||||
|
||||
- alert: NoActivity
|
||||
expr: |
|
||||
time() - opencode_last_activity_timestamp > 3600
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: "No activity on {{ $labels.stack_name }}"
|
||||
description: "Stack {{ $labels.stack_name }} has had no activity for over 1 hour."
|
||||
|
||||
- alert: HighTokenUsage
|
||||
expr: |
|
||||
sum by (stack_name) (increase(opencode_tokens_total[1h])) > 100000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High token usage on {{ $labels.stack_name }}"
|
||||
description: "Stack {{ $labels.stack_name }} has used over 100k tokens in the last hour."
|
||||
|
||||
- alert: LogIngestDown
|
||||
expr: up{job="log-ingest"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Log ingest service is down"
|
||||
description: "The central log ingest service has been down for more than 2 minutes."
|
||||
|
||||
- alert: LokiDown
|
||||
expr: up{job="loki"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Loki is down"
|
||||
description: "Loki log aggregation service has been down for more than 2 minutes."
|
||||
@@ -0,0 +1,11 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'AI Stack Dashboards'
|
||||
orgId: 1
|
||||
folder: 'AI Stacks'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
@@ -0,0 +1,17 @@
|
||||
apiVersion: 1

# Grafana datasource provisioning.
# The uids are pinned because the provisioned dashboard panels reference
# their datasources by uid ("prometheus" and "loki"); without an explicit uid
# Grafana generates one and those references would not resolve.
datasources:
  # Metrics backend; default datasource for new panels.
  - name: Prometheus
    type: prometheus
    uid: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: false

  # Log backend.
  - name: Loki
    type: loki
    uid: loki
    access: proxy
    url: http://loki:3100
    editable: false
    jsonData:
      # Upper bound on log lines returned per query.
      maxLines: 1000
|
||||
51
logging-stack/config/loki-config.yml
Normal file
51
logging-stack/config/loki-config.yml
Normal file
@@ -0,0 +1,51 @@
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
grpc_listen_port: 9096
|
||||
|
||||
common:
|
||||
instance_addr: 127.0.0.1
|
||||
path_prefix: /loki
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /loki/chunks
|
||||
rules_directory: /loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
query_range:
|
||||
results_cache:
|
||||
cache:
|
||||
embedded_cache:
|
||||
enabled: true
|
||||
max_size_mb: 100
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2020-10-24
|
||||
store: boltdb-shipper
|
||||
object_store: filesystem
|
||||
schema: v11
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
ruler:
|
||||
alertmanager_url: http://localhost:9093
|
||||
|
||||
limits_config:
|
||||
retention_period: 168h
|
||||
ingestion_rate_mb: 10
|
||||
ingestion_burst_size_mb: 20
|
||||
max_streams_per_user: 10000
|
||||
max_line_size: 256kb
|
||||
|
||||
compactor:
|
||||
working_directory: /loki/compactor
|
||||
shared_store: filesystem
|
||||
retention_enabled: true
|
||||
retention_delete_delay: 2h
|
||||
retention_delete_worker_count: 150
|
||||
62
logging-stack/config/prometheus.yml
Normal file
62
logging-stack/config/prometheus.yml
Normal file
@@ -0,0 +1,62 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
external_labels:
|
||||
monitor: 'ai-stack-monitor'
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: []
|
||||
|
||||
rule_files:
|
||||
- /etc/prometheus/alerting/*.yml
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
|
||||
- job_name: 'loki'
|
||||
static_configs:
|
||||
- targets: ['loki:3100']
|
||||
|
||||
- job_name: 'log-ingest'
|
||||
static_configs:
|
||||
- targets: ['log-ingest:3000']
|
||||
|
||||
- job_name: 'ai-stacks'
|
||||
docker_sd_configs:
|
||||
- host: unix:///var/run/docker.sock
|
||||
refresh_interval: 30s
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_docker_container_name]
|
||||
regex: '/(ai-stack-.*|app-.*opencode.*)'
|
||||
action: keep
|
||||
- source_labels: [__meta_docker_container_name]
|
||||
regex: '/?(.*)'
|
||||
target_label: container
|
||||
- source_labels: [__meta_docker_port_private]
|
||||
regex: '9090'
|
||||
action: keep
|
||||
- source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
|
||||
target_label: service
|
||||
- source_labels: [__meta_docker_container_label_stack_name]
|
||||
target_label: stack_name
|
||||
- source_labels: [__meta_docker_container_name]
|
||||
regex: '.*opencode-([a-z0-9-]+).*'
|
||||
replacement: '${1}'
|
||||
target_label: stack_name
|
||||
- source_labels: [__meta_docker_container_name]
|
||||
regex: '.*ai-stack-([a-z0-9-]+).*'
|
||||
replacement: '${1}'
|
||||
target_label: stack_name
|
||||
- target_label: __address__
|
||||
replacement: '${1}:9090'
|
||||
source_labels: [__meta_docker_container_network_ip]
|
||||
|
||||
- job_name: 'ai-stacks-static'
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- /etc/prometheus/targets/*.json
|
||||
refresh_interval: 30s
|
||||
71
logging-stack/config/promtail-config.yml
Normal file
71
logging-stack/config/promtail-config.yml
Normal file
@@ -0,0 +1,71 @@
|
||||
server:
|
||||
http_listen_port: 9080
|
||||
grpc_listen_port: 0
|
||||
|
||||
positions:
|
||||
filename: /tmp/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: http://loki:3100/loki/api/v1/push
|
||||
|
||||
scrape_configs:
|
||||
- job_name: docker
|
||||
docker_sd_configs:
|
||||
- host: unix:///var/run/docker.sock
|
||||
refresh_interval: 5s
|
||||
relabel_configs:
|
||||
- source_labels: ['__meta_docker_container_name']
|
||||
regex: '/(.*)'
|
||||
target_label: 'container'
|
||||
- source_labels: ['__meta_docker_container_label_com_docker_swarm_service_name']
|
||||
target_label: 'service'
|
||||
- source_labels: ['__meta_docker_container_label_com_docker_compose_project']
|
||||
target_label: 'project'
|
||||
- source_labels: ['__meta_docker_container_name']
|
||||
regex: '/?(ai-stack-.*|app-.*opencode.*)'
|
||||
action: keep
|
||||
- source_labels: ['__meta_docker_container_label_stack_name']
|
||||
target_label: 'stack_name'
|
||||
- source_labels: ['__meta_docker_container_name']
|
||||
regex: '.*opencode-([a-z0-9-]+).*'
|
||||
target_label: 'stack_name'
|
||||
- source_labels: ['__meta_docker_container_name']
|
||||
regex: '.*ai-stack-([a-z0-9-]+).*'
|
||||
target_label: 'stack_name'
|
||||
pipeline_stages:
|
||||
- json:
|
||||
expressions:
|
||||
output: log
|
||||
stream: stream
|
||||
timestamp: time
|
||||
- labels:
|
||||
stream:
|
||||
- timestamp:
|
||||
source: timestamp
|
||||
format: RFC3339Nano
|
||||
- output:
|
||||
source: output
|
||||
|
||||
# Tails the structured event files (JSON Lines) written by the AI stacks.
- job_name: ai-stack-events
  static_configs:
    - targets:
        - localhost
      labels:
        job: ai-stack-events
        __path__: /var/log/ai-stack/*.jsonl
  pipeline_stages:
    # Extract fields from each JSON event line.
    - json:
        expressions:
          stack_name: stack_name
          session_id: session_id
          event_type: event_type
          model: data.model
          agent: data.agent
          tool: data.tool
    # Promote only bounded-cardinality fields to labels. session_id is
    # extracted above but deliberately NOT made a label: every distinct label
    # value creates a separate stream, and session ids are unbounded. Filter
    # on it at query time instead, e.g. `| json | session_id="..."`.
    - labels:
        stack_name:
        event_type:
        model:
        agent:
        tool:
|
||||
508
logging-stack/dashboards/ai-stack-overview.json
Normal file
508
logging-stack/dashboards/ai-stack-overview.json
Normal file
@@ -0,0 +1,508 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": []
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(opencode_active_sessions{stack_name=~\"$stack_name\"})",
|
||||
"legendFormat": "Active Sessions",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Active Sessions",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
|
||||
"id": 2,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(opencode_messages_total{stack_name=~\"$stack_name\"}[$__range]))",
|
||||
"legendFormat": "Total Messages",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Messages (Period)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
|
||||
"id": 3,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(increase(opencode_tokens_total{stack_name=~\"$stack_name\"}[$__range]))",
|
||||
"legendFormat": "Total Tokens",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Tokens Used (Period)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null },
|
||||
{ "color": "yellow", "value": 0.01 },
|
||||
{ "color": "red", "value": 0.05 }
|
||||
]
|
||||
},
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
|
||||
"id": 4,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "auto",
|
||||
"orientation": "auto",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"textMode": "auto"
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(opencode_errors_total{stack_name=~\"$stack_name\"}[5m])) / sum(rate(opencode_messages_total{stack_name=~\"$stack_name\"}[5m]))",
|
||||
"legendFormat": "Error Rate",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Error Rate",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
|
||||
"id": 5,
|
||||
"options": {
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "single", "sort": "none" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (stack_name) (rate(opencode_messages_total{stack_name=~\"$stack_name\"}[5m]))",
|
||||
"legendFormat": "{{stack_name}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Messages per Second by Stack",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "auto",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
},
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
|
||||
"id": 6,
|
||||
"options": {
|
||||
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
|
||||
"tooltip": { "mode": "single", "sort": "none" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (model) (rate(opencode_tokens_total{stack_name=~\"$stack_name\"}[5m]))",
|
||||
"legendFormat": "{{model}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Token Usage by Model",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 },
|
||||
"id": 7,
|
||||
"options": {
|
||||
"legend": { "displayMode": "list", "placement": "right", "showLegend": true },
|
||||
"pieType": "pie",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"tooltip": { "mode": "single", "sort": "none" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (tool) (increase(opencode_tool_invocations_total{stack_name=~\"$stack_name\"}[$__range]))",
|
||||
"legendFormat": "{{tool}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Tool Usage Distribution",
|
||||
"type": "piechart"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 },
|
||||
"id": 8,
|
||||
"options": {
|
||||
"legend": { "displayMode": "list", "placement": "right", "showLegend": true },
|
||||
"pieType": "pie",
|
||||
"reduceOptions": {
|
||||
"calcs": ["lastNotNull"],
|
||||
"fields": "",
|
||||
"values": false
|
||||
},
|
||||
"tooltip": { "mode": "single", "sort": "none" }
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum by (agent) (increase(opencode_messages_total{stack_name=~\"$stack_name\"}[$__range]))",
|
||||
"legendFormat": "{{agent}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Agent Usage Distribution",
|
||||
"type": "piechart"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"custom": {
|
||||
"align": "auto",
|
||||
"cellOptions": { "type": "auto" },
|
||||
"inspect": false
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{ "color": "green", "value": null }
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 },
|
||||
"id": 9,
|
||||
"options": {
|
||||
"cellHeight": "sm",
|
||||
"footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false },
|
||||
"showHeader": true
|
||||
},
|
||||
"pluginVersion": "10.2.0",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "topk(10, sum by (stack_name) (increase(opencode_messages_total[$__range])))",
|
||||
"format": "table",
|
||||
"instant": true,
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Top 10 Active Stacks",
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": { "Time": true },
|
||||
"indexByName": {},
|
||||
"renameByName": { "Value": "Messages", "stack_name": "Stack Name" }
|
||||
}
|
||||
}
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "loki",
|
||||
"uid": "loki"
|
||||
},
|
||||
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 20 },
|
||||
"id": 10,
|
||||
"options": {
|
||||
"dedupStrategy": "none",
|
||||
"enableLogDetails": true,
|
||||
"prettifyLogMessage": false,
|
||||
"showCommonLabels": false,
|
||||
"showLabels": false,
|
||||
"showTime": true,
|
||||
"sortOrder": "Descending",
|
||||
"wrapLogMessage": false
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "{stack_name=~\"$stack_name\"} |= ``",
|
||||
"legendFormat": "",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Live Logs",
|
||||
"type": "logs"
|
||||
}
|
||||
],
|
||||
"refresh": "10s",
|
||||
"schemaVersion": 38,
|
||||
"tags": ["ai-stack", "monitoring"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"allValue": ".*",
|
||||
"current": {
|
||||
"selected": true,
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
},
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "prometheus"
|
||||
},
|
||||
"definition": "label_values(opencode_messages_total, stack_name)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Stack Name",
|
||||
"multi": true,
|
||||
"name": "stack_name",
|
||||
"options": [],
|
||||
"query": {
|
||||
"query": "label_values(opencode_messages_total, stack_name)",
|
||||
"refId": "StandardVariableQuery"
|
||||
},
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "AI Stack Overview",
|
||||
"uid": "ai-stack-overview",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
138
logging-stack/docker-compose.yml
Normal file
138
logging-stack/docker-compose.yml
Normal file
@@ -0,0 +1,138 @@
|
||||
version: "3.8"

# AI Stack Logging Infrastructure
# Loki (logs) + Prometheus (metrics) + Grafana (visualization)

services:
  # =============================================================================
  # LOKI - Log Aggregation
  # =============================================================================
  loki:
    image: grafana/loki:2.9.0
    container_name: ai-stack-loki
    ports:
      - "3100:3100"
    volumes:
      - ./config/loki-config.yml:/etc/loki/local-config.yaml:ro
      - loki-data:/loki
    command: -config.file=/etc/loki/local-config.yaml
    networks:
      - logging-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3

  # =============================================================================
  # PROMTAIL - Log Collector (ships logs to Loki)
  # =============================================================================
  promtail:
    image: grafana/promtail:2.9.0
    container_name: ai-stack-promtail
    volumes:
      - ./config/promtail-config.yml:/etc/promtail/config.yml:ro
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command: -config.file=/etc/promtail/config.yml
    networks:
      - logging-network
    depends_on:
      - loki
    restart: unless-stopped

  # =============================================================================
  # PROMETHEUS - Metrics Collection
  # =============================================================================
  prometheus:
    image: prom/prometheus:v2.47.0
    container_name: ai-stack-prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./alerting:/etc/prometheus/alerting:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=90d'
      - '--web.enable-lifecycle'
      # NOTE(review): the admin API is enabled while port 9090 is published on the
      # host, so anything that can reach that port can call the admin endpoints.
      # Confirm this is intended or restrict the port binding.
      - '--web.enable-admin-api'
    networks:
      - logging-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

  # =============================================================================
  # GRAFANA - Visualization & Dashboards
  # =============================================================================
  grafana:
    image: grafana/grafana:10.2.0
    container_name: ai-stack-grafana
    ports:
      - "3001:3000"
    environment:
      # NOTE(review): both credentials fall back to "admin" when the variables are
      # unset; set GRAFANA_ADMIN_PASSWORD for any deployment that is reachable by others.
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3001}
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
    volumes:
      - ./config/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./dashboards:/var/lib/grafana/dashboards:ro
      - grafana-data:/var/lib/grafana
    networks:
      - logging-network
    depends_on:
      - loki
      - prometheus
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3

  # =============================================================================
  # LOG INGEST API - Custom endpoint for AI stack events
  # =============================================================================
  log-ingest:
    build:
      context: ./log-ingest
      dockerfile: Dockerfile
    container_name: ai-stack-log-ingest
    ports:
      - "3102:3000"
    environment:
      - LOKI_URL=http://loki:3100
      - LOG_LEVEL=info
    networks:
      - logging-network
    depends_on:
      - loki
    restart: unless-stopped
    healthcheck:
      # The image is built FROM oven/bun, which is not expected to ship wget or curl,
      # so the probe uses the bun runtime that is guaranteed to be present.
      test: ["CMD", "bun", "-e", "fetch('http://localhost:3000/health').then(r => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1))"]
      interval: 30s
      timeout: 10s
      retries: 3

networks:
  logging-network:
    driver: bridge
    name: ai-stack-logging

volumes:
  loki-data:
    name: ai-stack-loki-data
  prometheus-data:
    name: ai-stack-prometheus-data
  grafana-data:
    name: ai-stack-grafana-data
|
||||
13
logging-stack/log-ingest/Dockerfile
Normal file
13
logging-stack/log-ingest/Dockerfile
Normal file
@@ -0,0 +1,13 @@
|
||||
FROM oven/bun:1.0

WORKDIR /app

# Manifests are copied before the sources so the install layer is only rebuilt
# when package.json or the (optional) lockfile changes.
COPY package.json bun.lockb* ./

# Prefer a lockfile-exact install and fall back to a plain install when that
# fails (for example when no lockfile was copied). stderr is intentionally left
# visible so the reason for the fallback appears in the build log.
RUN bun install --frozen-lockfile || bun install

COPY . .

ENV PORT=3000
EXPOSE 3000

CMD ["bun", "run", "src/index.ts"]
|
||||
16
logging-stack/log-ingest/package.json
Normal file
16
logging-stack/log-ingest/package.json
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
"name": "ai-stack-log-ingest",
|
||||
"version": "1.0.0",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"start": "bun run src/index.ts",
|
||||
"dev": "bun --watch run src/index.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"hono": "^4.0.0",
|
||||
"prom-client": "^15.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/bun": "latest"
|
||||
}
|
||||
}
|
||||
199
logging-stack/log-ingest/src/index.ts
Normal file
199
logging-stack/log-ingest/src/index.ts
Normal file
@@ -0,0 +1,199 @@
|
||||
import { Hono } from 'hono';
|
||||
import { cors } from 'hono/cors';
|
||||
import { logger } from 'hono/logger';
|
||||
import { Registry, Counter, Histogram, Gauge, collectDefaultMetrics } from 'prom-client';
|
||||
|
||||
/** Hono application that serves the ingest, health and metrics routes. */
const app = new Hono();

/** Dedicated Prometheus registry; every metric below registers itself here. */
const register = new Registry();

// Adds prom-client's default process/runtime metrics to the same registry.
collectDefaultMetrics({ register });

/** Service-level Prometheus metrics exposed through `GET /metrics`. */
const metrics = {
  /** Incremented once per accepted event, labelled by stack and event type. */
  eventsReceived: new Counter({
    name: 'log_ingest_events_total',
    help: 'Total events received',
    labelNames: ['stack_name', 'event_type'],
    registers: [register]
  }),

  /** Request handling time in seconds, measured by the ingest handlers. */
  eventProcessingDuration: new Histogram({
    name: 'log_ingest_processing_duration_seconds',
    help: 'Event processing duration',
    labelNames: ['stack_name'],
    buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
    registers: [register]
  }),

  /** Incremented by the ingest handlers when processing a request fails. */
  lokiPushErrors: new Counter({
    name: 'log_ingest_loki_errors_total',
    help: 'Loki push errors',
    registers: [register]
  }),

  /** Set from the size of the seen-stacks set each time `/metrics` is scraped. */
  activeStacks: new Gauge({
    name: 'log_ingest_active_stacks',
    help: 'Number of active stacks sending events',
    registers: [register]
  })
};
|
||||
|
||||
/** Base URL of the Loki instance that receives pushed streams. */
const LOKI_URL = process.env.LOKI_URL || 'http://loki:3100';

/** One event reported by a deployed AI stack to the ingest endpoints. */
interface LogEvent {
  /** Event time; parsed with `new Date(...)`. The current time is used when absent. */
  timestamp?: string;
  /** Name of the reporting stack; required by the ingest handlers. */
  stack_name: string;
  session_id?: string;
  /** Kind of event; required by the ingest handlers. */
  event_type: 'session_start' | 'session_end' | 'message' | 'tool_use' | 'error' | 'mcp_connect' | 'mcp_disconnect';
  /** Optional event details; `model`, `agent` and `tool` are also used as Loki labels. */
  data?: {
    role?: 'user' | 'assistant' | 'system';
    model?: string;
    agent?: string;
    tool?: string;
    tokens_in?: number;
    tokens_out?: number;
    duration_ms?: number;
    success?: boolean;
    error_code?: string;
    error_message?: string;
    content_length?: number;
    content_hash?: string;
    mcp_server?: string;
  };
}

/** Stack names seen since process start; entries are never removed. */
const activeStacksSet = new Set<string>();

/** One Loki stream: a label set plus its `[nanosecond timestamp, log line]` entries. */
type LokiStream = { stream: Record<string, string>; values: [string, string][] };

/**
 * Converts an event timestamp into the nanosecond-epoch string used in a Loki entry.
 *
 * A missing or unparseable timestamp falls back to the current time; without the
 * fallback `BigInt(NaN)` throws a RangeError and the whole request would fail.
 */
function toLokiNanoseconds(timestamp: string | undefined): string {
  const parsed = timestamp ? new Date(timestamp).getTime() : Number.NaN;
  const millis = Number.isFinite(parsed) ? parsed : Date.now();
  return (BigInt(millis) * BigInt(1_000_000)).toString();
}

/**
 * Groups events into Loki streams by label set and pushes them in one request.
 *
 * Each event becomes one entry whose line is the JSON-serialised event.
 *
 * @param events Events to forward; an empty list is a no-op.
 * @throws Error when Loki answers with a non-2xx status (status and body are in the message).
 */
async function pushToLoki(events: LogEvent[]): Promise<void> {
  // Nothing to send: skip the network round-trip instead of posting an empty payload.
  if (events.length === 0) {
    return;
  }

  const streams = new Map<string, LokiStream>();

  for (const event of events) {
    // NOTE(review): session_id is used as a stream label, so every session creates
    // new streams. Loki's guidance is to keep label cardinality low - consider
    // leaving it in the log line only (it is already part of the serialised event).
    const labels: Record<string, string> = {
      job: 'ai-stack-events',
      stack_name: event.stack_name,
      event_type: event.event_type
    };
    if (event.session_id) labels.session_id = event.session_id;
    if (event.data?.model) labels.model = event.data.model;
    if (event.data?.agent) labels.agent = event.data.agent;
    if (event.data?.tool) labels.tool = event.data.tool;

    // Events with an identical label set share one stream.
    const labelKey = JSON.stringify(labels);
    let stream = streams.get(labelKey);
    if (!stream) {
      stream = { stream: labels, values: [] };
      streams.set(labelKey, stream);
    }

    stream.values.push([toLokiNanoseconds(event.timestamp), JSON.stringify(event)]);
  }

  const response = await fetch(`${LOKI_URL}/loki/api/v1/push`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json'
    },
    body: JSON.stringify({ streams: [...streams.values()] })
  });

  if (!response.ok) {
    const text = await response.text();
    throw new Error(`Loki push failed: ${response.status} ${text}`);
  }
}
|
||||
|
||||
// Global middleware: CORS headers and request logging on every route.
app.use('*', cors());
app.use('*', logger());

/** Liveness probe: always reports healthy together with the current time. */
app.get('/health', (c) => {
  return c.json({ status: 'healthy', timestamp: new Date().toISOString() });
});

/** Prometheus scrape endpoint for the service registry. */
app.get('/metrics', async (c) => {
  // The gauge is refreshed lazily, at scrape time, from the set of seen stacks.
  metrics.activeStacks.set(activeStacksSet.size);
  c.header('Content-Type', register.contentType);
  return c.text(await register.metrics());
});
|
||||
|
||||
/**
 * POST /ingest - accepts one event or an array of events and forwards them to Loki.
 *
 * Responses:
 * - 400 when the body is not valid JSON or any event lacks `stack_name` / `event_type`
 * - 500 when the push to Loki fails
 * - 200 `{ success: true, count }` otherwise
 */
app.post('/ingest', async (c) => {
  const startTime = Date.now();

  // A malformed body is a client error, not a Loki failure.
  let body: unknown;
  try {
    body = await c.req.json();
  } catch {
    return c.json({ error: 'Invalid JSON body' }, 400);
  }

  // Guards against null / non-object items as well as missing required fields.
  const isLogEvent = (value: unknown): value is LogEvent => {
    if (typeof value !== 'object' || value === null) {
      return false;
    }
    const { stack_name, event_type } = value as Record<string, unknown>;
    return (
      typeof stack_name === 'string' && stack_name !== '' &&
      typeof event_type === 'string' && event_type !== ''
    );
  };

  const candidates: unknown[] = Array.isArray(body) ? body : [body];
  const events = candidates.filter(isLogEvent);

  // Validate everything before touching any counter so a rejected request
  // leaves no partial metric updates behind.
  if (events.length !== candidates.length) {
    return c.json({ error: 'Missing required fields: stack_name, event_type' }, 400);
  }

  for (const event of events) {
    activeStacksSet.add(event.stack_name);
    metrics.eventsReceived.inc({ stack_name: event.stack_name, event_type: event.event_type });
  }

  try {
    await pushToLoki(events);
  } catch (error) {
    metrics.lokiPushErrors.inc();
    console.error('Ingest error:', error);
    return c.json({ error: 'Failed to process events', details: String(error) }, 500);
  }

  // The whole-request duration is recorded once per event, under that event's stack.
  const duration = (Date.now() - startTime) / 1000;
  for (const event of events) {
    metrics.eventProcessingDuration.observe({ stack_name: event.stack_name }, duration);
  }

  return c.json({ success: true, count: events.length });
});
|
||||
|
||||
/**
 * POST /ingest/batch - accepts an array of events and forwards the valid ones to Loki.
 *
 * Events without `stack_name` / `event_type` are dropped rather than failing the
 * batch; they are neither counted nor pushed, and are reported as `skipped`.
 *
 * Responses:
 * - 400 when the body is not valid JSON or not an array
 * - 500 when the push to Loki fails
 * - 200 `{ success: true, count, skipped }` otherwise
 */
app.post('/ingest/batch', async (c) => {
  const startTime = Date.now();

  // A malformed body is a client error, not a Loki failure.
  let body: unknown;
  try {
    body = await c.req.json();
  } catch {
    return c.json({ error: 'Invalid JSON body' }, 400);
  }

  if (!Array.isArray(body)) {
    return c.json({ error: 'Expected array of events' }, 400);
  }

  // Guards against null / non-object items as well as missing required fields.
  const isLogEvent = (value: unknown): value is LogEvent => {
    if (typeof value !== 'object' || value === null) {
      return false;
    }
    const { stack_name, event_type } = value as Record<string, unknown>;
    return (
      typeof stack_name === 'string' && stack_name !== '' &&
      typeof event_type === 'string' && event_type !== ''
    );
  };

  const candidates: unknown[] = body;
  // Only valid events are forwarded; previously invalid ones still reached Loki.
  const events = candidates.filter(isLogEvent);

  for (const event of events) {
    activeStacksSet.add(event.stack_name);
    metrics.eventsReceived.inc({ stack_name: event.stack_name, event_type: event.event_type });
  }

  try {
    await pushToLoki(events);
  } catch (error) {
    metrics.lokiPushErrors.inc();
    console.error('Batch ingest error:', error);
    return c.json({ error: 'Failed to process batch', details: String(error) }, 500);
  }

  // Batches are timed as a whole under the fixed 'batch' label.
  const duration = (Date.now() - startTime) / 1000;
  metrics.eventProcessingDuration.observe({ stack_name: 'batch' }, duration);

  return c.json({
    success: true,
    count: events.length,
    skipped: candidates.length - events.length
  });
});
|
||||
|
||||
// Parse PORT as base-10 and fall back to 3000 when it is unset, empty,
// non-numeric or outside the valid port range (a bare parseInt would yield NaN).
const parsedPort = Number.parseInt(process.env.PORT ?? '', 10);
const port =
  Number.isInteger(parsedPort) && parsedPort >= 0 && parsedPort <= 65535 ? parsedPort : 3000;

console.log(`Log ingest service starting on port ${port}`);

// The default export carries the port and the app's fetch handler.
export default {
  port,
  fetch: app.fetch
};
|
||||
12
logging-stack/log-ingest/tsconfig.json
Normal file
12
logging-stack/log-ingest/tsconfig.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ESNext",
|
||||
"module": "ESNext",
|
||||
"moduleResolution": "bundler",
|
||||
"types": ["bun-types"],
|
||||
"esModuleInterop": true,
|
||||
"strict": true,
|
||||
"skipLibCheck": true,
|
||||
"noEmit": true
|
||||
}
|
||||
}
|
||||
@@ -388,6 +388,16 @@ export class DokployProductionClient {
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Sends the application's environment definition via `POST /application.update`.
 *
 * @param applicationId Identifier of the application to update.
 * @param env Environment definition as a single string; the deployer in this
 *   change passes newline-separated `KEY=value` pairs.
 */
async setApplicationEnv(applicationId: string, env: string): Promise<void> {
  // NOTE(review): the trailing 'application' / 'set-env' arguments are passed
  // through to this.request unchanged; their meaning is defined there.
  await this.request(
    'POST',
    '/application.update',
    { applicationId, env },
    'application',
    'set-env'
  );
}
|
||||
|
||||
async getApplication(applicationId: string): Promise<DokployApplication> {
|
||||
return this.request<DokployApplication>(
|
||||
'GET',
|
||||
|
||||
@@ -149,6 +149,20 @@ export class DokployClient {
|
||||
} satisfies CreateDomainRequest);
|
||||
}
|
||||
|
||||
/**
 * Sends the application's environment definition via `POST /application.update`.
 *
 * @param applicationId Identifier of the application to update.
 * @param env Environment definition as a single string; the deployer in this
 *   change passes newline-separated `KEY=value` pairs.
 */
async setApplicationEnv(applicationId: string, env: string): Promise<void> {
  await this.request('POST', '/application.update', {
    applicationId,
    env
  });
}
|
||||
|
||||
/**
 * Sends a single `key=value` pair as the application's `dockerLabels`
 * via `POST /application.update`.
 *
 * @param applicationId Identifier of the application to update.
 * @param key Label name.
 * @param value Label value.
 */
async addApplicationLabel(applicationId: string, key: string, value: string): Promise<void> {
  // NOTE(review): only this one pair is sent as the whole dockerLabels field;
  // whether the API merges it with or replaces existing labels is not visible
  // here - confirm before relying on "add" semantics.
  await this.request('POST', '/application.update', {
    applicationId,
    dockerLabels: `${key}=${value}`
  });
}
|
||||
|
||||
/**
 * Requests a deployment of the application via `POST /application.deploy`.
 *
 * @param applicationId Identifier of the application to deploy.
 */
async deployApplication(applicationId: string): Promise<void> {
  await this.request('POST', '/application.deploy', { applicationId });
}
|
||||
|
||||
@@ -280,6 +280,18 @@ export class ProductionDeployer {
|
||||
registryId: config.registryId,
|
||||
});
|
||||
|
||||
state.progress = 52;
|
||||
state.message = 'Setting environment variables for logging';
|
||||
|
||||
const envVars = [
|
||||
`STACK_NAME=${config.stackName}`,
|
||||
`USAGE_LOGGING_ENABLED=true`,
|
||||
`LOG_INGEST_URL=${process.env.LOG_INGEST_URL || 'http://10.100.0.20:3102/ingest'}`,
|
||||
`METRICS_PORT=9090`,
|
||||
].join('\n');
|
||||
|
||||
await this.client.setApplicationEnv(state.resources.applicationId, envVars);
|
||||
|
||||
state.progress = 55;
|
||||
state.message = 'Creating persistent storage';
|
||||
|
||||
|
||||
Reference in New Issue
Block a user