feat: add comprehensive logging infrastructure

- Add Loki/Prometheus/Grafana stack in logging-stack/
- Add log-ingest service for receiving events from AI stacks
- Add Grafana dashboard with stack_name filtering
- Update Dokploy client with setApplicationEnv method
- Configure STACK_NAME env var for deployed stacks
- Add alerting rules for stack health monitoring
This commit is contained in:
Oussama Douhou
2026-01-10 13:22:46 +01:00
parent e617114310
commit 2f4722acd0
16 changed files with 1631 additions and 0 deletions

View File

@@ -0,0 +1,62 @@
# Prometheus alerting rules for the "ai-stack-alerts" group.
#
# Structure: groups -> rules -> one mapping per alert. Each rule has a PromQL
# `expr`, a `for` hold duration that the condition must stay true before the
# alert fires, static `labels`, and templated `annotations`.
#
# Annotations that reference {{ $labels.stack_name }} assume the series
# returned by `expr` carries a `stack_name` label -- TODO confirm for the
# plain `up` and `opencode_last_activity_timestamp` selectors, which do not
# aggregate by that label themselves.
groups:
  - name: ai-stack-alerts
    rules:
      # A scrape target of job "ai-stacks" has reported up == 0 for 5 minutes.
      - alert: StackUnhealthy
        expr: up{job="ai-stacks"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Stack {{ $labels.stack_name }} is down"
          description: "AI Stack {{ $labels.stack_name }} has been unhealthy for more than 5 minutes."

      # Per-stack ratio of error rate to message rate (5m windows) stays above
      # 0.1 for 10 minutes. `/` binds tighter than `>`, so the whole ratio is
      # compared against the threshold. When both rates are zero the ratio is
      # NaN, which fails the comparison, so an idle stack does not fire.
      - alert: HighErrorRate
        expr: |
          sum by (stack_name) (rate(opencode_errors_total[5m]))
          /
          sum by (stack_name) (rate(opencode_messages_total[5m]))
          > 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has error rate above 10% for 10 minutes."

      # Seconds elapsed since opencode_last_activity_timestamp exceed 3600.
      # Assumes the metric holds a Unix timestamp in seconds -- TODO confirm
      # against the exporter.
      - alert: NoActivity
        expr: |
          time() - opencode_last_activity_timestamp > 3600
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "No activity on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has had no activity for over 1 hour."

      # Per-stack growth of opencode_tokens_total over the trailing hour
      # exceeds 100000.
      - alert: HighTokenUsage
        expr: |
          sum by (stack_name) (increase(opencode_tokens_total[1h])) > 100000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High token usage on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has used over 100k tokens in the last hour."

      # The scrape target of job "log-ingest" has reported up == 0 for 2 minutes.
      # NOTE(review): `up == 0` only matches targets that are still being
      # scraped; a target missing from the scrape config produces no series
      # and therefore no alert -- confirm whether an absent() rule is wanted.
      - alert: LogIngestDown
        expr: up{job="log-ingest"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Log ingest service is down"
          description: "The central log ingest service has been down for more than 2 minutes."

      # The scrape target of job "loki" has reported up == 0 for 2 minutes.
      - alert: LokiDown
        expr: up{job="loki"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Loki is down"
          description: "Loki log aggregation service has been down for more than 2 minutes."