feat: add comprehensive logging infrastructure

- Add Loki/Prometheus/Grafana stack in logging-stack/
- Add log-ingest service for receiving events from AI stacks
- Add Grafana dashboard with stack_name filtering
- Update Dokploy client with setApplicationEnv method
- Configure STACK_NAME env var for deployed stacks
- Add alerting rules for stack health monitoring
Author: Oussama Douhou
Date: 2026-01-10 13:22:46 +01:00
Parent: e617114310
Commit: 2f4722acd0
16 changed files with 1631 additions and 0 deletions
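The pieces below fit together as follows: each deployed stack gets a STACK_NAME environment variable and pushes structured events to the new log-ingest service. A minimal sender sketch, assuming the ingest service is reachable on its published port 3102 (see docker-compose below) at a placeholder hostname, with illustrative event values:

// Hypothetical sender sketch; "logging.example.internal" is a placeholder host.
const INGEST_URL = process.env.LOG_INGEST_URL ?? 'http://logging.example.internal:3102';

async function sendEvent() {
  const event = {
    stack_name: process.env.STACK_NAME ?? 'unknown', // set per deployed stack
    session_id: 'ses_123',
    event_type: 'message',
    timestamp: new Date().toISOString(),
    data: { role: 'assistant', model: 'some-model', tokens_in: 120, tokens_out: 480 }
  };

  const res = await fetch(`${INGEST_URL}/ingest`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(event)
  });
  if (!res.ok) console.error('ingest failed:', res.status, await res.text());
}

await sendEvent();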

View File

@@ -0,0 +1,62 @@
groups:
  - name: ai-stack-alerts
    rules:
      - alert: StackUnhealthy
        expr: up{job="ai-stacks"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Stack {{ $labels.stack_name }} is down"
          description: "AI Stack {{ $labels.stack_name }} has been unhealthy for more than 5 minutes."

      - alert: HighErrorRate
        expr: |
          sum by (stack_name) (rate(opencode_errors_total[5m]))
          /
          sum by (stack_name) (rate(opencode_messages_total[5m]))
          > 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has error rate above 10% for 10 minutes."

      - alert: NoActivity
        expr: |
          time() - opencode_last_activity_timestamp > 3600
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "No activity on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has had no activity for over 1 hour."

      - alert: HighTokenUsage
        expr: |
          sum by (stack_name) (increase(opencode_tokens_total[1h])) > 100000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High token usage on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has used over 100k tokens in the last hour."

      - alert: LogIngestDown
        expr: up{job="log-ingest"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Log ingest service is down"
          description: "The central log ingest service has been down for more than 2 minutes."

      - alert: LokiDown
        expr: up{job="loki"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Loki is down"
          description: "Loki log aggregation service has been down for more than 2 minutes."

View File

@@ -0,0 +1,11 @@
apiVersion: 1

providers:
  - name: 'AI Stack Dashboards'
    orgId: 1
    folder: 'AI Stacks'
    type: file
    disableDeletion: false
    editable: true
    options:
      path: /var/lib/grafana/dashboards

View File

@@ -0,0 +1,17 @@
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    # Explicit uid so the provisioned dashboard's datasource references ("uid": "prometheus") resolve.
    uid: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: false

  - name: Loki
    type: loki
    # Explicit uid so the dashboard's Loki panel ("uid": "loki") resolves.
    uid: loki
    access: proxy
    url: http://loki:3100
    editable: false
    jsonData:
      maxLines: 1000

View File

@@ -0,0 +1,51 @@
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

ruler:
  alertmanager_url: http://localhost:9093

limits_config:
  retention_period: 168h
  ingestion_rate_mb: 10
  ingestion_burst_size_mb: 20
  max_streams_per_user: 10000
  max_line_size: 256kb

compactor:
  working_directory: /loki/compactor
  shared_store: filesystem
  retention_enabled: true
  retention_delete_delay: 2h
  retention_delete_worker_count: 150
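Once events land in Loki, they can be read back over its HTTP query API using the labels attached elsewhere in this commit (job, stack_name). A minimal query sketch, assuming Loki's published port 3100 on localhost and an illustrative stack name:

// Query the last hour of events for one stack via Loki's query_range endpoint.
const LOKI_URL = 'http://localhost:3100';

async function queryStackEvents(stackName: string) {
  const params = new URLSearchParams({
    query: `{job="ai-stack-events", stack_name="${stackName}"}`,
    start: new Date(Date.now() - 60 * 60 * 1000).toISOString(),
    end: new Date().toISOString(),
    limit: '100'
  });
  const res = await fetch(`${LOKI_URL}/loki/api/v1/query_range?${params}`);
  if (!res.ok) throw new Error(`Loki query failed: ${res.status}`);
  const body = await res.json();
  return body.data.result; // array of streams, each with [timestamp, line] values
}

const streams = await queryStackEvents('demo-stack');
console.log(streams.length, 'streams returned');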

View File

@@ -0,0 +1,62 @@
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: 'ai-stack-monitor'

alerting:
  alertmanagers:
    - static_configs:
        - targets: []

rule_files:
  - /etc/prometheus/alerting/*.yml

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'loki'
    static_configs:
      - targets: ['loki:3100']

  - job_name: 'log-ingest'
    static_configs:
      - targets: ['log-ingest:3000']

  - job_name: 'ai-stacks'
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 30s
    relabel_configs:
      - source_labels: [__meta_docker_container_name]
        regex: '/(ai-stack-.*|app-.*opencode.*)'
        action: keep
      - source_labels: [__meta_docker_container_name]
        regex: '/?(.*)'
        target_label: container
      - source_labels: [__meta_docker_port_private]
        regex: '9090'
        action: keep
      - source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
        target_label: service
      - source_labels: [__meta_docker_container_label_stack_name]
        target_label: stack_name
      - source_labels: [__meta_docker_container_name]
        regex: '.*opencode-([a-z0-9-]+).*'
        replacement: '${1}'
        target_label: stack_name
      - source_labels: [__meta_docker_container_name]
        regex: '.*ai-stack-([a-z0-9-]+).*'
        replacement: '${1}'
        target_label: stack_name
      - source_labels: [__meta_docker_container_network_ip]
        replacement: '${1}:9090'
        target_label: __address__

  - job_name: 'ai-stacks-static'
    file_sd_configs:
      - files:
          - /etc/prometheus/targets/*.json
        refresh_interval: 30s
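The ai-stacks-static job reads targets from JSON files under /etc/prometheus/targets. A minimal sketch of generating one file_sd entry; the IP, stack name, and output path are illustrative, and the file must end up in the directory mounted at that path inside the Prometheus container:

// Write a file_sd target entry that the 'ai-stacks-static' job will pick up.
import { mkdirSync, writeFileSync } from 'node:fs';

const targets = [
  {
    targets: ['10.0.0.12:9090'], // metrics endpoint of a deployed stack (example IP)
    labels: { stack_name: 'demo-stack' }
  }
];

mkdirSync('./targets', { recursive: true });
writeFileSync('./targets/demo-stack.json', JSON.stringify(targets, null, 2));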

View File

@@ -0,0 +1,71 @@
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
    relabel_configs:
      - source_labels: ['__meta_docker_container_name']
        regex: '/(.*)'
        target_label: 'container'
      - source_labels: ['__meta_docker_container_label_com_docker_swarm_service_name']
        target_label: 'service'
      - source_labels: ['__meta_docker_container_label_com_docker_compose_project']
        target_label: 'project'
      - source_labels: ['__meta_docker_container_name']
        regex: '/?(ai-stack-.*|app-.*opencode.*)'
        action: keep
      - source_labels: ['__meta_docker_container_label_stack_name']
        target_label: 'stack_name'
      - source_labels: ['__meta_docker_container_name']
        regex: '.*opencode-([a-z0-9-]+).*'
        target_label: 'stack_name'
      - source_labels: ['__meta_docker_container_name']
        regex: '.*ai-stack-([a-z0-9-]+).*'
        target_label: 'stack_name'
    pipeline_stages:
      - json:
          expressions:
            output: log
            stream: stream
            timestamp: time
      - labels:
          stream:
      - timestamp:
          source: timestamp
          format: RFC3339Nano
      - output:
          source: output

  - job_name: ai-stack-events
    static_configs:
      - targets:
          - localhost
        labels:
          job: ai-stack-events
          __path__: /var/log/ai-stack/*.jsonl
    pipeline_stages:
      - json:
          expressions:
            stack_name: stack_name
            session_id: session_id
            event_type: event_type
            model: data.model
            agent: data.agent
            tool: data.tool
      - labels:
          stack_name:
          session_id:
          event_type:
          model:
          agent:
          tool:
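The ai-stack-events job tails JSONL files under /var/log/ai-stack/ and promotes the fields listed above to labels. A minimal sketch of appending one compatible event line on the host; the path comes from the __path__ glob, and the field values are illustrative:

// Append one event line to the JSONL file scraped by the 'ai-stack-events' job.
// Field names mirror the json pipeline_stage expressions above.
import { appendFileSync, mkdirSync } from 'node:fs';

const line = JSON.stringify({
  timestamp: new Date().toISOString(),
  stack_name: 'demo-stack',
  session_id: 'ses_123',
  event_type: 'tool_use',
  data: { model: 'some-model', agent: 'build', tool: 'bash' }
});

mkdirSync('/var/log/ai-stack', { recursive: true });
appendFileSync('/var/log/ai-stack/events.jsonl', line + '\n');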

View File

@@ -0,0 +1,508 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
"id": 1,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.2.0",
"targets": [
{
"expr": "count(opencode_active_sessions{stack_name=~\"$stack_name\"})",
"legendFormat": "Active Sessions",
"refId": "A"
}
],
"title": "Active Sessions",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.2.0",
"targets": [
{
"expr": "sum(increase(opencode_messages_total{stack_name=~\"$stack_name\"}[$__range]))",
"legendFormat": "Total Messages",
"refId": "A"
}
],
"title": "Messages (Period)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.2.0",
"targets": [
{
"expr": "sum(increase(opencode_tokens_total{stack_name=~\"$stack_name\"}[$__range]))",
"legendFormat": "Total Tokens",
"refId": "A"
}
],
"title": "Tokens Used (Period)",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.01 },
{ "color": "red", "value": 0.05 }
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "10.2.0",
"targets": [
{
"expr": "sum(rate(opencode_errors_total{stack_name=~\"$stack_name\"}[5m])) / sum(rate(opencode_messages_total{stack_name=~\"$stack_name\"}[5m]))",
"legendFormat": "Error Rate",
"refId": "A"
}
],
"title": "Error Rate",
"type": "stat"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
"id": 5,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"targets": [
{
"expr": "sum by (stack_name) (rate(opencode_messages_total{stack_name=~\"$stack_name\"}[5m]))",
"legendFormat": "{{stack_name}}",
"refId": "A"
}
],
"title": "Messages per Second by Stack",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "auto",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
"id": 6,
"options": {
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "single", "sort": "none" }
},
"targets": [
{
"expr": "sum by (model) (rate(opencode_tokens_total{stack_name=~\"$stack_name\"}[5m]))",
"legendFormat": "{{model}}",
"refId": "A"
}
],
"title": "Token Usage by Model",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
}
},
"overrides": []
},
"gridPos": { "h": 8, "w": 8, "x": 0, "y": 12 },
"id": 7,
"options": {
"legend": { "displayMode": "list", "placement": "right", "showLegend": true },
"pieType": "pie",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"tooltip": { "mode": "single", "sort": "none" }
},
"targets": [
{
"expr": "sum by (tool) (increase(opencode_tool_invocations_total{stack_name=~\"$stack_name\"}[$__range]))",
"legendFormat": "{{tool}}",
"refId": "A"
}
],
"title": "Tool Usage Distribution",
"type": "piechart"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
}
},
"overrides": []
},
"gridPos": { "h": 8, "w": 8, "x": 8, "y": 12 },
"id": 8,
"options": {
"legend": { "displayMode": "list", "placement": "right", "showLegend": true },
"pieType": "pie",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"tooltip": { "mode": "single", "sort": "none" }
},
"targets": [
{
"expr": "sum by (agent) (increase(opencode_messages_total{stack_name=~\"$stack_name\"}[$__range]))",
"legendFormat": "{{agent}}",
"refId": "A"
}
],
"title": "Agent Usage Distribution",
"type": "piechart"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"custom": {
"align": "auto",
"cellOptions": { "type": "auto" },
"inspect": false
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null }
]
}
},
"overrides": []
},
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 12 },
"id": 9,
"options": {
"cellHeight": "sm",
"footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false },
"showHeader": true
},
"pluginVersion": "10.2.0",
"targets": [
{
"expr": "topk(10, sum by (stack_name) (increase(opencode_messages_total[$__range])))",
"format": "table",
"instant": true,
"legendFormat": "",
"refId": "A"
}
],
"title": "Top 10 Active Stacks",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": { "Time": true },
"indexByName": {},
"renameByName": { "Value": "Messages", "stack_name": "Stack Name" }
}
}
],
"type": "table"
},
{
"datasource": {
"type": "loki",
"uid": "loki"
},
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 20 },
"id": 10,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"targets": [
{
"expr": "{stack_name=~\"$stack_name\"} |= ``",
"legendFormat": "",
"refId": "A"
}
],
"title": "Live Logs",
"type": "logs"
}
],
"refresh": "10s",
"schemaVersion": 38,
"tags": ["ai-stack", "monitoring"],
"templating": {
"list": [
{
"allValue": ".*",
"current": {
"selected": true,
"text": "All",
"value": "$__all"
},
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"definition": "label_values(opencode_messages_total, stack_name)",
"hide": 0,
"includeAll": true,
"label": "Stack Name",
"multi": true,
"name": "stack_name",
"options": [],
"query": {
"query": "label_values(opencode_messages_total, stack_name)",
"refId": "StandardVariableQuery"
},
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "AI Stack Overview",
"uid": "ai-stack-overview",
"version": 1,
"weekStart": ""
}

View File

@@ -0,0 +1,138 @@
version: "3.8"

# AI Stack Logging Infrastructure
# Loki (logs) + Prometheus (metrics) + Grafana (visualization)

services:
  # =============================================================================
  # LOKI - Log Aggregation
  # =============================================================================
  loki:
    image: grafana/loki:2.9.0
    container_name: ai-stack-loki
    ports:
      - "3100:3100"
    volumes:
      - ./config/loki-config.yml:/etc/loki/local-config.yaml:ro
      - loki-data:/loki
    command: -config.file=/etc/loki/local-config.yaml
    networks:
      - logging-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3100/ready || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3

  # =============================================================================
  # PROMTAIL - Log Collector (ships logs to Loki)
  # =============================================================================
  promtail:
    image: grafana/promtail:2.9.0
    container_name: ai-stack-promtail
    volumes:
      - ./config/promtail-config.yml:/etc/promtail/config.yml:ro
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command: -config.file=/etc/promtail/config.yml
    networks:
      - logging-network
    depends_on:
      - loki
    restart: unless-stopped

  # =============================================================================
  # PROMETHEUS - Metrics Collection
  # =============================================================================
  prometheus:
    image: prom/prometheus:v2.47.0
    container_name: ai-stack-prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./alerting:/etc/prometheus/alerting:ro
      # Required by the 'ai-stacks' job in prometheus.yml, which uses docker_sd_configs
      # and needs read access to the Docker socket inside the container.
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=90d'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    networks:
      - logging-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

  # =============================================================================
  # GRAFANA - Visualization & Dashboards
  # =============================================================================
  grafana:
    image: grafana/grafana:10.2.0
    container_name: ai-stack-grafana
    ports:
      - "3001:3000"
    environment:
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3001}
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
    volumes:
      - ./config/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./dashboards:/var/lib/grafana/dashboards:ro
      - grafana-data:/var/lib/grafana
    networks:
      - logging-network
    depends_on:
      - loki
      - prometheus
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3

  # =============================================================================
  # LOG INGEST API - Custom endpoint for AI stack events
  # =============================================================================
  log-ingest:
    build:
      context: ./log-ingest
      dockerfile: Dockerfile
    container_name: ai-stack-log-ingest
    ports:
      - "3102:3000"
    environment:
      - LOKI_URL=http://loki:3100
      - LOG_LEVEL=info
    networks:
      - logging-network
    depends_on:
      - loki
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3000/health"]
      interval: 30s
      timeout: 10s
      retries: 3

networks:
  logging-network:
    driver: bridge
    name: ai-stack-logging

volumes:
  loki-data:
    name: ai-stack-loki-data
  prometheus-data:
    name: ai-stack-prometheus-data
  grafana-data:
    name: ai-stack-grafana-data
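A quick way to verify the stack after docker compose up is to hit the same health endpoints the healthchecks use, via the published host ports. A minimal smoke-test sketch, assuming the ports configured above and localhost access:

// Probe each service's health endpoint from the host, using the published ports.
const checks: Record<string, string> = {
  loki: 'http://localhost:3100/ready',
  prometheus: 'http://localhost:9090/-/healthy',
  grafana: 'http://localhost:3001/api/health',
  'log-ingest': 'http://localhost:3102/health'
};

for (const [name, url] of Object.entries(checks)) {
  try {
    const res = await fetch(url);
    console.log(`${name}: ${res.ok ? 'ok' : `unhealthy (${res.status})`}`);
  } catch (err) {
    console.log(`${name}: unreachable (${err})`);
  }
}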

View File

@@ -0,0 +1,13 @@
FROM oven/bun:1.0
WORKDIR /app
COPY package.json bun.lockb* ./
RUN bun install --frozen-lockfile 2>/dev/null || bun install
COPY . .
ENV PORT=3000
EXPOSE 3000
CMD ["bun", "run", "src/index.ts"]

View File

@@ -0,0 +1,16 @@
{
  "name": "ai-stack-log-ingest",
  "version": "1.0.0",
  "type": "module",
  "scripts": {
    "start": "bun run src/index.ts",
    "dev": "bun --watch run src/index.ts"
  },
  "dependencies": {
    "hono": "^4.0.0",
    "prom-client": "^15.0.0"
  },
  "devDependencies": {
    "@types/bun": "latest"
  }
}

View File

@@ -0,0 +1,199 @@
import { Hono } from 'hono';
import { cors } from 'hono/cors';
import { logger } from 'hono/logger';
import { Registry, Counter, Histogram, Gauge, collectDefaultMetrics } from 'prom-client';

const app = new Hono();
const register = new Registry();
collectDefaultMetrics({ register });

// Prometheus metrics exposed at /metrics and scraped by the 'log-ingest' job.
const metrics = {
  eventsReceived: new Counter({
    name: 'log_ingest_events_total',
    help: 'Total events received',
    labelNames: ['stack_name', 'event_type'],
    registers: [register]
  }),
  eventProcessingDuration: new Histogram({
    name: 'log_ingest_processing_duration_seconds',
    help: 'Event processing duration',
    labelNames: ['stack_name'],
    buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
    registers: [register]
  }),
  lokiPushErrors: new Counter({
    name: 'log_ingest_loki_errors_total',
    help: 'Loki push errors',
    registers: [register]
  }),
  activeStacks: new Gauge({
    name: 'log_ingest_active_stacks',
    help: 'Number of active stacks sending events',
    registers: [register]
  })
};

const LOKI_URL = process.env.LOKI_URL || 'http://loki:3100';

interface LogEvent {
  timestamp?: string;
  stack_name: string;
  session_id?: string;
  event_type: 'session_start' | 'session_end' | 'message' | 'tool_use' | 'error' | 'mcp_connect' | 'mcp_disconnect';
  data?: {
    role?: 'user' | 'assistant' | 'system';
    model?: string;
    agent?: string;
    tool?: string;
    tokens_in?: number;
    tokens_out?: number;
    duration_ms?: number;
    success?: boolean;
    error_code?: string;
    error_message?: string;
    content_length?: number;
    content_hash?: string;
    mcp_server?: string;
  };
}

const activeStacksSet = new Set<string>();

// Group events into Loki streams by label set and push them via the Loki HTTP API.
async function pushToLoki(events: LogEvent[]): Promise<void> {
  const streams: Record<string, { stream: Record<string, string>; values: [string, string][] }> = {};

  for (const event of events) {
    const labels = {
      job: 'ai-stack-events',
      stack_name: event.stack_name,
      event_type: event.event_type,
      ...(event.session_id && { session_id: event.session_id }),
      ...(event.data?.model && { model: event.data.model }),
      ...(event.data?.agent && { agent: event.data.agent }),
      ...(event.data?.tool && { tool: event.data.tool })
    };
    const labelKey = JSON.stringify(labels);
    if (!streams[labelKey]) {
      streams[labelKey] = { stream: labels, values: [] };
    }

    // Loki expects nanosecond-precision Unix timestamps as strings.
    const timestamp = event.timestamp || new Date().toISOString();
    const nanoseconds = BigInt(new Date(timestamp).getTime()) * BigInt(1_000_000);
    streams[labelKey].values.push([nanoseconds.toString(), JSON.stringify(event)]);
  }

  const payload = { streams: Object.values(streams) };
  const response = await fetch(`${LOKI_URL}/loki/api/v1/push`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify(payload)
  });
  if (!response.ok) {
    const text = await response.text();
    throw new Error(`Loki push failed: ${response.status} ${text}`);
  }
}

app.use('*', cors());
app.use('*', logger());

app.get('/health', (c) => {
  return c.json({ status: 'healthy', timestamp: new Date().toISOString() });
});

app.get('/metrics', async (c) => {
  metrics.activeStacks.set(activeStacksSet.size);
  c.header('Content-Type', register.contentType);
  return c.text(await register.metrics());
});

// Accepts a single event or an array of events; rejects the whole request if any
// event is missing the required fields.
app.post('/ingest', async (c) => {
  const startTime = Date.now();
  try {
    const body = await c.req.json();
    const events: LogEvent[] = Array.isArray(body) ? body : [body];
    for (const event of events) {
      if (!event.stack_name || !event.event_type) {
        return c.json({ error: 'Missing required fields: stack_name, event_type' }, 400);
      }
      activeStacksSet.add(event.stack_name);
      metrics.eventsReceived.inc({ stack_name: event.stack_name, event_type: event.event_type });
    }
    await pushToLoki(events);

    const duration = (Date.now() - startTime) / 1000;
    for (const event of events) {
      metrics.eventProcessingDuration.observe({ stack_name: event.stack_name }, duration);
    }
    return c.json({ success: true, count: events.length });
  } catch (error) {
    metrics.lokiPushErrors.inc();
    console.error('Ingest error:', error);
    return c.json({ error: 'Failed to process events', details: String(error) }, 500);
  }
});

// Accepts an array of events; skips invalid entries instead of failing the whole batch.
app.post('/ingest/batch', async (c) => {
  const startTime = Date.now();
  try {
    const body = await c.req.json();
    if (!Array.isArray(body)) {
      return c.json({ error: 'Expected array of events' }, 400);
    }
    // Drop events missing required fields so they are neither counted nor pushed to Loki.
    const events: LogEvent[] = body.filter(
      (event: LogEvent) => event.stack_name && event.event_type
    );
    for (const event of events) {
      activeStacksSet.add(event.stack_name);
      metrics.eventsReceived.inc({ stack_name: event.stack_name, event_type: event.event_type });
    }
    await pushToLoki(events);

    const duration = (Date.now() - startTime) / 1000;
    metrics.eventProcessingDuration.observe({ stack_name: 'batch' }, duration);
    return c.json({ success: true, count: events.length });
  } catch (error) {
    metrics.lokiPushErrors.inc();
    console.error('Batch ingest error:', error);
    return c.json({ error: 'Failed to process batch', details: String(error) }, 500);
  }
});

const port = parseInt(process.env.PORT || '3000', 10);
console.log(`Log ingest service starting on port ${port}`);

// Bun serves the default export: { port, fetch }.
export default {
  port,
  fetch: app.fetch
};
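A usage sketch for the batch endpoint above, posted from the host to the published port 3102 (per docker-compose); the stack and session identifiers are illustrative:

// Post a small batch of events to the log-ingest service.
const events = [
  { stack_name: 'demo-stack', session_id: 'ses_123', event_type: 'session_start' },
  {
    stack_name: 'demo-stack',
    session_id: 'ses_123',
    event_type: 'message',
    data: { role: 'assistant', model: 'some-model', tokens_in: 80, tokens_out: 300 }
  },
  { stack_name: 'demo-stack', session_id: 'ses_123', event_type: 'session_end' }
];

const res = await fetch('http://localhost:3102/ingest/batch', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify(events)
});
console.log(res.status, await res.json()); // e.g. 200 { success: true, count: 3 }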

View File

@@ -0,0 +1,12 @@
{
  "compilerOptions": {
    "target": "ESNext",
    "module": "ESNext",
    "moduleResolution": "bundler",
    "types": ["bun-types"],
    "esModuleInterop": true,
    "strict": true,
    "skipLibCheck": true,
    "noEmit": true
  }
}