Files
ai-stack-deployer/logging-stack/alerting/ai-stack-alerts.yml
Oussama Douhou 2f4722acd0 feat: add comprehensive logging infrastructure
- Add Loki/Prometheus/Grafana stack in logging-stack/
- Add log-ingest service for receiving events from AI stacks
- Add Grafana dashboard with stack_name filtering
- Update Dokploy client with setApplicationEnv method
- Configure STACK_NAME env var for deployed stacks
- Add alerting rules for stack health monitoring
2026-01-10 13:22:46 +01:00

63 lines
2.0 KiB
YAML

# Prometheus alerting rules for the AI stack deployment and its logging
# infrastructure. One rule group, six alerts. Rules that reference
# `stack_name` in their annotations rely on that label being present on the
# series the expression returns.
groups:
  - name: ai-stack-alerts
    rules:
      # Fires when the scrape-health series `up` for job "ai-stacks" has been
      # 0 for 5 minutes.
      # NOTE(review): the annotations read $labels.stack_name, so the scrape
      # targets must carry a `stack_name` label -- confirm in the scrape config.
      - alert: StackUnhealthy
        expr: up{job="ai-stacks"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Stack {{ $labels.stack_name }} is down"
          description: "AI Stack {{ $labels.stack_name }} has been unhealthy for more than 5 minutes."

      # Per-stack ratio of the 5m error rate to the 5m message rate; fires
      # when the ratio stays above 0.1 for 10 minutes.
      # NOTE(review): with a zero message rate the ratio is NaN (no errors
      # either; filtered out by the comparison) or +Inf (errors present; the
      # alert condition holds) -- confirm that is the intended behaviour.
      - alert: HighErrorRate
        expr: |
          sum by (stack_name) (rate(opencode_errors_total[5m]))
          /
          sum by (stack_name) (rate(opencode_messages_total[5m]))
          > 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has error rate above 10% for 10 minutes."

      # Fires when the gap between the evaluation time and the last-activity
      # value exceeds 3600 and stays that way for 5 minutes.
      # NOTE(review): assumes opencode_last_activity_timestamp is a Unix
      # timestamp in seconds -- TODO confirm against the exporter.
      # The expression is not aggregated, so it yields one alert per series.
      - alert: NoActivity
        expr: |
          time() - opencode_last_activity_timestamp > 3600
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "No activity on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has had no activity for over 1 hour."

      # Fires when the per-stack increase of the token counter over a 1h
      # window stays above 100000 for 5 minutes.
      - alert: HighTokenUsage
        expr: |
          sum by (stack_name) (increase(opencode_tokens_total[1h])) > 100000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High token usage on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has used over 100k tokens in the last hour."

      # Fires when `up` for job "log-ingest" has been 0 for 2 minutes.
      # NOTE(review): `up == 0` returns nothing if the series is missing
      # entirely (e.g. the job is not scraped); consider whether an absent()
      # check is wanted as well.
      - alert: LogIngestDown
        expr: up{job="log-ingest"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Log ingest service is down"
          description: "The central log ingest service has been down for more than 2 minutes."

      # Fires when `up` for job "loki" has been 0 for 2 minutes.
      # NOTE(review): like any `up == 0` rule, this stays silent when the
      # series does not exist at all -- verify the "loki" job is scraped.
      - alert: LokiDown
        expr: up{job="loki"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Loki is down"
          description: "Loki log aggregation service has been down for more than 2 minutes."