feat: add comprehensive logging infrastructure
- Add Loki/Prometheus/Grafana stack in logging-stack/
- Add log-ingest service for receiving events from AI stacks
- Add Grafana dashboard with stack_name filtering
- Update Dokploy client with setApplicationEnv method
- Configure STACK_NAME env var for deployed stacks
- Add alerting rules for stack health monitoring
This commit is contained in:
62
logging-stack/alerting/ai-stack-alerts.yml
Normal file
62
logging-stack/alerting/ai-stack-alerts.yml
Normal file
@@ -0,0 +1,62 @@
|
||||
# Prometheus alerting rules for the AI stacks and the central logging services.
# Every rule lives in the single group "ai-stack-alerts". Annotations that
# interpolate {{ $labels.stack_name }} only render a name when the alerting
# series actually carries a stack_name label.
groups:
  - name: ai-stack-alerts
    rules:
      # A scrape target in job "ai-stacks" has reported up == 0 for 5 minutes.
      # NOTE(review): assumes the scrape config attaches a stack_name label to
      # these targets; otherwise the annotations render an empty name - confirm.
      - alert: StackUnhealthy
        expr: up{job="ai-stacks"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Stack {{ $labels.stack_name }} is down"
          description: "AI Stack {{ $labels.stack_name }} has been unhealthy for more than 5 minutes."

      # Per-stack ratio of the 5-minute error rate to the 5-minute message
      # rate stays above 0.1 (10%) for 10 minutes.
      # NOTE(review): a stack with errors but a zero message rate yields +Inf,
      # which satisfies "> 0.1"; confirm that firing in that case is intended.
      - alert: HighErrorRate
        expr: |
          sum by (stack_name) (rate(opencode_errors_total[5m]))
          /
          sum by (stack_name) (rate(opencode_messages_total[5m]))
          > 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has error rate above 10% for 10 minutes."

      # More than 3600 seconds separate the evaluation time from the value of
      # opencode_last_activity_timestamp, sustained for 5 minutes.
      # NOTE(review): presumably the metric is a Unix timestamp in seconds -
      # verify against the exporter that publishes it.
      - alert: NoActivity
        expr: |
          time() - opencode_last_activity_timestamp > 3600
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "No activity on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has had no activity for over 1 hour."

      # Per-stack increase of opencode_tokens_total over the trailing hour
      # exceeds 100000, sustained for 5 minutes.
      - alert: HighTokenUsage
        expr: |
          sum by (stack_name) (increase(opencode_tokens_total[1h])) > 100000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High token usage on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has used over 100k tokens in the last hour."

      # The "log-ingest" scrape job has reported up == 0 for 2 minutes.
      - alert: LogIngestDown
        expr: up{job="log-ingest"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Log ingest service is down"
          description: "The central log ingest service has been down for more than 2 minutes."

      # The "loki" scrape job has reported up == 0 for 2 minutes.
      - alert: LokiDown
        expr: up{job="loki"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Loki is down"
          description: "Loki log aggregation service has been down for more than 2 minutes."
|
||||
Reference in New Issue
Block a user