# Commit summary:
#   - Add Loki/Prometheus/Grafana stack in logging-stack/
#   - Add log-ingest service for receiving events from AI stacks
#   - Add Grafana dashboard with stack_name filtering
#   - Update Dokploy client with setApplicationEnv method
#   - Configure STACK_NAME env var for deployed stacks
#   - Add alerting rules for stack health monitoring
# File viewer metadata: 63 lines, 2.0 KiB, YAML
groups:
  # Alerting rules covering the deployed AI stacks and the logging services
  # (log-ingest, Loki) referenced by the expressions below.
  - name: ai-stack-alerts
    rules:
      # Fires when a target of the "ai-stacks" scrape job has reported
      # up == 0 continuously for 5 minutes.
      # NOTE(review): the annotations read $labels.stack_name, so the
      # ai-stacks targets presumably carry a stack_name label - confirm
      # against the scrape configuration.
      - alert: StackUnhealthy
        expr: up{job="ai-stacks"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Stack {{ $labels.stack_name }} is down"
          description: "AI Stack {{ $labels.stack_name }} has been unhealthy for more than 5 minutes."

      # Per-stack ratio of the 5-minute error rate to the 5-minute message
      # rate; fires when the ratio stays above 0.1 (10%) for 10 minutes.
      - alert: HighErrorRate
        expr: |
          sum by (stack_name) (rate(opencode_errors_total[5m]))
          /
          sum by (stack_name) (rate(opencode_messages_total[5m]))
          > 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has error rate above 10% for 10 minutes."

      # Fires when the last-activity timestamp is more than 3600 seconds
      # behind the evaluation time. The additional `for: 5m` means the alert
      # only becomes active after the condition has held for 5 more minutes.
      # NOTE(review): assumes opencode_last_activity_timestamp is in Unix
      # seconds and carries a stack_name label - confirm with the exporter.
      - alert: NoActivity
        expr: |
          time() - opencode_last_activity_timestamp > 3600
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "No activity on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has had no activity for over 1 hour."

      # Fires when the per-stack increase of opencode_tokens_total over the
      # trailing hour exceeds 100000 and stays there for 5 minutes.
      - alert: HighTokenUsage
        expr: |
          sum by (stack_name) (increase(opencode_tokens_total[1h])) > 100000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High token usage on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has used over 100k tokens in the last hour."

      # Fires when a target of the "log-ingest" scrape job has reported
      # up == 0 for 2 minutes.
      - alert: LogIngestDown
        expr: up{job="log-ingest"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Log ingest service is down"
          description: "The central log ingest service has been down for more than 2 minutes."

      # Fires when a target of the "loki" scrape job has reported
      # up == 0 for 2 minutes.
      - alert: LokiDown
        expr: up{job="loki"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Loki is down"
          description: "Loki log aggregation service has been down for more than 2 minutes."