groups:
  - name: application_logs
    rules:
      # Example 1: Alert immediately on critical errors
      - alert: CriticalErrorInLogs
        # Count the lines containing "panic" or "fatal" per minute; if > 0, fire.
        # The != "count_over_time" filter drops Loki's own query logs, which quote
        # this rule's expression and would otherwise match it (a feedback loop).
        expr: count_over_time({job="podman-containers"} |~ "panic|fatal" != "count_over_time" [1m]) > 0
        for: 0m # Fire immediately
        labels:
          severity: critical
        annotations:
          summary: "Panic detected in {{ $labels.container_name }}"
          description: "The container {{ $labels.container_name }} is logging panics/crashes."

      # Example 2: Alert on high error rate
      - alert: HighErrorRate
        # If there are more than 10 errors per minute
        # (?i) makes it case-insensitive (Error, error, ERROR)
        expr: count_over_time({job="podman-containers"} |~ "(?i)error" != "count_over_time" [1m]) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Many errors in {{ $labels.container_name }}"
          description: "More than 10 errors per minute detected in the logs."
  - name: monitoring_log_alerts
    rules:
      # --- GENERIC ERROR CATCHER ---
      # Checks ALL containers in the monitoring stack
      - alert: MonitoringStackErrors
        # The != "count_over_time" exclusion keeps Loki's own ruler query logs
        # (which contain the word "error" in this expression) from tripping the rule.
        expr: |
          count_over_time({container_name=~"prometheus|loki|grafana|alertmanager|alloy"} |~ "(?i)error" != "count_over_time" [2m]) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Many errors in monitoring stack: {{ $labels.container_name }}"
          description: "Container {{ $labels.container_name }} is logging more than 10 errors per 2 minutes."

      # --- SPECIFIC LOKI ERROR ---
      - alert: LokiIngestionError
        # Match flush failures when Loki writes chunks to storage
        expr: count_over_time({container_name="loki"} |= "failed to flush" != "count_over_time" [1m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Loki cannot write data"
          description: "Loki is logging 'failed to flush'. Check disk space or permissions!"

      # --- SPECIFIC ALLOY ERROR ---
      - alert: AlloyPipelineBlocked
        expr: count_over_time({container_name="alloy"} |= "component evaluation failed" != "count_over_time" [1m]) > 0
        labels:
          severity: warning
        annotations:
          summary: "Alloy Pipeline Error"
          description: "A component of the Alloy pipeline is not working correctly."
  - name: loki_watchdog
    rules:
      # 1. The Standard Watchdog
      # This alert is ALWAYS 'Firing'.
      # Purpose: Prove that Loki Ruler is running and can talk to Alertmanager.
      - alert: LokiWatchdog
        expr: vector(1)
        labels:
          severity: none
        annotations:
          summary: "Loki Watchdog (Heartbeat)"
          description: "This alert must always be visible. If it's missing, the Loki alerting pipeline is not working anymore."

      # 2. Ingestion Watchdog
      # This alert fires if NO logs come in at all.
      # Unlike the watchdog above, this tests whether Alloy/Promtail is still sending data.
      - alert: LokiNoLogsIngested
        # Checks whether the total rate of logs (across all jobs) is 0.
        # The "or vector(0)" fallback matters: when no streams match at all,
        # sum(rate(...)) returns an empty result and a bare "== 0" would never fire.
        expr: (sum(rate({job=~".+"}[2m])) or vector(0)) == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Loki is not receiving logs anymore"
          description: "No logs have been received in Loki for 5 minutes. Check Alloy or Disk Space."