---
# Prometheus alerting rules for the AI stacks and their logging pipeline.
# Every alert must hold its condition for the `for` duration before firing.
groups:
  - name: ai-stack-alerts
    rules:
      # Fires when the `up` series for the "ai-stacks" job has been 0 for
      # 5 minutes.
      # NOTE(review): the annotations read $labels.stack_name; confirm the
      # scrape targets of this job actually carry that label.
      - alert: StackUnhealthy
        expr: up{job="ai-stacks"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Stack {{ $labels.stack_name }} is down"
          description: "AI Stack {{ $labels.stack_name }} has been unhealthy for more than 5 minutes."

      # Per-stack ratio of the error rate to the message rate, both taken
      # over 5m windows; fires when the ratio stays above 0.1 for 10 minutes.
      # `/` binds tighter than `>`, so the comparison applies to the ratio.
      - alert: HighErrorRate
        expr: |
          sum by (stack_name) (rate(opencode_errors_total[5m]))
          /
          sum by (stack_name) (rate(opencode_messages_total[5m]))
          > 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has error rate above 10% for 10 minutes."

      # Fires when the gap between now and the last-activity gauge exceeds
      # 3600 and stays that way for 5 minutes.
      # NOTE(review): assumes opencode_last_activity_timestamp is a Unix
      # timestamp in seconds and carries a stack_name label -- confirm
      # against the exporter.
      - alert: NoActivity
        expr: |
          time() - opencode_last_activity_timestamp > 3600
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "No activity on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has had no activity for over 1 hour."

      # Per-stack increase of the token counter over the trailing hour;
      # fires when it stays above 100000 for 5 minutes.
      - alert: HighTokenUsage
        expr: |
          sum by (stack_name) (increase(opencode_tokens_total[1h])) > 100000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High token usage on {{ $labels.stack_name }}"
          description: "Stack {{ $labels.stack_name }} has used over 100k tokens in the last hour."

      # Fires when the `up` series for the "log-ingest" job has been 0 for
      # 2 minutes.
      - alert: LogIngestDown
        expr: up{job="log-ingest"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Log ingest service is down"
          description: "The central log ingest service has been down for more than 2 minutes."

      # Fires when the `up` series for the "loki" job has been 0 for
      # 2 minutes.
      - alert: LokiDown
        expr: up{job="loki"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Loki is down"
          description: "Loki log aggregation service has been down for more than 2 minutes."