feat: add comprehensive logging infrastructure

- Add Loki/Prometheus/Grafana stack in logging-stack/
- Add log-ingest service for receiving events from AI stacks
- Add Grafana dashboard with stack_name filtering
- Update Dokploy client with setApplicationEnv method
- Configure STACK_NAME env var for deployed stacks
- Add alerting rules for stack health monitoring
This commit is contained in:
Oussama Douhou
2026-01-10 13:22:46 +01:00
parent e617114310
commit 2f4722acd0
16 changed files with 1631 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
# Grafana dashboard provisioning.
# Registers a single file-based provider that loads dashboards from a
# directory on disk and places them in the 'AI Stacks' folder.
apiVersion: 1

providers:
  - name: 'AI Stack Dashboards'
    orgId: 1
    folder: 'AI Stacks'
    type: file
    # Provisioned dashboards may be deleted and edited from the UI.
    disableDeletion: false
    editable: true
    options:
      # Directory scanned for dashboard definitions.
      path: /var/lib/grafana/dashboards

View File

@@ -0,0 +1,17 @@
# Grafana datasource provisioning.
# Declares the two backends used by this stack: Prometheus (the default
# datasource) and Loki. Both are reached through the Grafana server
# (access: proxy) and are locked against UI edits.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: false

  - name: Loki
    type: loki
    access: proxy
    url: http://loki:3100
    editable: false
    jsonData:
      # Upper bound on log lines returned per Loki query.
      maxLines: 1000

View File

@@ -0,0 +1,51 @@
# Loki configuration: a single instance with an in-memory ring, local
# filesystem storage under /loki, and compactor-driven retention.

# Authentication is turned off.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  instance_addr: 127.0.0.1
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  # One replica and an in-memory key-value store for the ring.
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

schema_config:
  configs:
    - from: 2020-10-24
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

ruler:
  # NOTE(review): points at localhost:9093 — confirm an Alertmanager is
  # actually reachable at this address from where Loki runs.
  alertmanager_url: http://localhost:9093

limits_config:
  # 168h = 7 days.
  retention_period: 168h
  ingestion_rate_mb: 10
  ingestion_burst_size_mb: 20
  max_streams_per_user: 10000
  max_line_size: 256kb

compactor:
  working_directory: /loki/compactor
  shared_store: filesystem
  # Retention is applied by the compactor; marked chunks are deleted
  # after the delay below.
  retention_enabled: true
  retention_delete_delay: 2h
  retention_delete_worker_count: 150

View File

@@ -0,0 +1,62 @@
# Prometheus configuration: scrapes Prometheus itself, Loki, the
# log-ingest service, AI stack containers discovered through the Docker
# socket, and any extra targets listed in JSON files.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: 'ai-stack-monitor'

alerting:
  alertmanagers:
    - static_configs:
        # No Alertmanager targets are configured; the list is empty.
        - targets: []

rule_files:
  - /etc/prometheus/alerting/*.yml

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'loki'
    static_configs:
      - targets: ['loki:3100']

  - job_name: 'log-ingest'
    static_configs:
      - targets: ['log-ingest:3000']

  # Containers discovered from the local Docker daemon.
  - job_name: 'ai-stacks'
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 30s
    relabel_configs:
      # Keep only containers whose name is "/ai-stack-..." or
      # "/app-...opencode..."; the pattern requires the leading slash.
      - source_labels: [__meta_docker_container_name]
        regex: '/(ai-stack-.*|app-.*opencode.*)'
        action: keep
      # Expose the container name, without a leading slash, as "container".
      - source_labels: [__meta_docker_container_name]
        regex: '/?(.*)'
        target_label: container
      # Keep only targets whose container-side port is 9090.
      - source_labels: [__meta_docker_port_private]
        regex: '9090'
        action: keep
      - source_labels: [__meta_docker_container_label_com_docker_swarm_service_name]
        target_label: service
      # stack_name is first taken from the container's "stack_name" label.
      - source_labels: [__meta_docker_container_label_stack_name]
        target_label: stack_name
      # The next two rules run afterwards: when the container name matches,
      # the captured text replaces the label-derived stack_name.
      # NOTE(review): confirm the name-derived value is meant to take
      # precedence over the explicit container label.
      - source_labels: [__meta_docker_container_name]
        regex: '.*opencode-([a-z0-9-]+).*'
        replacement: '${1}'
        target_label: stack_name
      - source_labels: [__meta_docker_container_name]
        regex: '.*ai-stack-([a-z0-9-]+).*'
        replacement: '${1}'
        target_label: stack_name
      # Scrape each container at <container network IP>:9090.
      - target_label: __address__
        replacement: '${1}:9090'
        source_labels: [__meta_docker_container_network_ip]

  # Additional targets read from JSON files, re-read every 30s.
  - job_name: 'ai-stacks-static'
    file_sd_configs:
      - files:
          - /etc/prometheus/targets/*.json
        refresh_interval: 30s

View File

@@ -0,0 +1,71 @@
# Promtail configuration: ships logs to Loki from two sources —
# containers discovered through the Docker socket, and JSONL event files
# under /var/log/ai-stack/.
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  # NOTE(review): the read-position file lives in /tmp. Confirm this path
  # survives restarts (e.g. is on a mounted volume); otherwise positions
  # are lost and files may be re-read from the start.
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  # Logs from containers discovered via the local Docker daemon.
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
    relabel_configs:
      # Container name without its leading slash.
      - source_labels: ['__meta_docker_container_name']
        regex: '/(.*)'
        target_label: 'container'
      - source_labels: ['__meta_docker_container_label_com_docker_swarm_service_name']
        target_label: 'service'
      - source_labels: ['__meta_docker_container_label_com_docker_compose_project']
        target_label: 'project'
      # Keep only containers named "ai-stack-..." or "app-...opencode...",
      # with or without a leading slash.
      - source_labels: ['__meta_docker_container_name']
        regex: '/?(ai-stack-.*|app-.*opencode.*)'
        action: keep
      # stack_name is first taken from the container's "stack_name" label.
      - source_labels: ['__meta_docker_container_label_stack_name']
        target_label: 'stack_name'
      # The next two rules run afterwards: when the container name matches,
      # the captured text replaces the label-derived stack_name.
      # NOTE(review): confirm the name-derived value is meant to take
      # precedence over the explicit container label.
      - source_labels: ['__meta_docker_container_name']
        regex: '.*opencode-([a-z0-9-]+).*'
        target_label: 'stack_name'
      - source_labels: ['__meta_docker_container_name']
        regex: '.*ai-stack-([a-z0-9-]+).*'
        target_label: 'stack_name'
    pipeline_stages:
      # Treat each line as a JSON object and pull out its "log", "stream"
      # and "time" fields.
      # NOTE(review): this assumes lines arrive wrapped in that JSON shape;
      # confirm that is what docker_sd_configs delivers, otherwise these
      # stages have nothing to extract.
      - json:
          expressions:
            output: log
            stream: stream
            timestamp: time
      # Promote the extracted "stream" value to a label.
      - labels:
          stream:
      # Use the extracted time as the entry timestamp.
      - timestamp:
          source: timestamp
          format: RFC3339Nano
      # Replace the log line with the extracted "log" field.
      - output:
          source: output

  # Structured event files, one JSON object per line.
  - job_name: ai-stack-events
    static_configs:
      - targets:
          - localhost
        labels:
          job: ai-stack-events
          __path__: /var/log/ai-stack/*.jsonl
    pipeline_stages:
      # Extract top-level fields plus model/agent/tool from the nested
      # "data" object.
      - json:
          expressions:
            stack_name: stack_name
            session_id: session_id
            event_type: event_type
            model: data.model
            agent: data.agent
            tool: data.tool
      # Promote every extracted field to a label; an empty value means the
      # label is filled from the extracted field of the same name.
      # NOTE(review): session_id (and possibly tool) may take many distinct
      # values, and each distinct label combination is a separate Loki
      # stream. Confirm the expected cardinality is acceptable.
      - labels:
          stack_name:
          session_id:
          event_type:
          model:
          agent:
          tool: