groups:
    # ---------------------------------------------------------
    # Fedora Workstation Alerts
    # ---------------------------------------------------------
  - name: fedora_workstation_alerts
    rules:
      # 1. Is the machine/exporter reachable?
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Target {{ $labels.job }} ({{ $labels.instance }}) is down"
          description: "The service {{ $labels.job }} has been unreachable for Prometheus for more than 1 minute."

      # 2. Warning when Disk space < 10%
      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Disk almost full ({{ $labels.instance }})"
          description: "Only {{ $value | humanize }}% space left on partition {{ $labels.mountpoint }}."

      # 3. High CPU load (load1 higher than number of cores * 1.5)
      - alert: HostHighCpuLoad
        expr: node_load1 > (count without (cpu, mode) (node_cpu_seconds_total{mode="idle"})) * 1.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU Load ({{ $labels.instance }})"
          description: "CPU load has been extremely high for 5 minutes."

      # 4. Memory full (< 10% available)
      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Memory almost full ({{ $labels.instance }})"
          description: "RAM memory is almost out (< 10% free)."

      # 5. Memory pressure (high major page fault rate)
      - alert: HostMemoryUnderMemoryPressure
        expr: (rate(node_vmstat_pgmajfault[5m]) > 1000)
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host memory under memory pressure (instance {{ $labels.instance }})
          description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 6. Info alert if memory is underutilized (< 20% used for 1 week).
      # You may want to increase the Alertmanager 'repeat_interval' for this type of alert
      # to daily or weekly (see the commented route example after this rule).
      - alert: HostMemoryIsUnderutilized
        expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host Memory is underutilized (instance {{ $labels.instance }})
          description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 7. High incoming network throughput (>80% of network speed)
      - alert: HostUnusualNetworkThroughputIn
        expr: ((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80)
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput in (instance {{ $labels.instance }})
          description: "Host receive bandwidth is high (>80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 8. High outgoing network throughput (>80% of network speed)
      - alert: HostUnusualNetworkThroughputOut
        expr: ((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80)
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host unusual network throughput out (instance {{ $labels.instance }})
          description: "Host transmit bandwidth is high (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 9. Warning if the disk is unusually busy (I/O time > 80%)
      - alert: HostUnusualDiskReadRate
        expr: (rate(node_disk_io_time_seconds_total[5m]) > .80)
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read rate (instance {{ $labels.instance }})
          description: "Disk is too busy (IO wait > 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 10. Critical alert if disk space < 10%
      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # The same rule using "node_filesystem_free_bytes" fires when the disk fills up
      # for non-root users (a commented example follows this rule).
      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Host out of disk space (instance {{ $labels.instance }})
          description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 11. Warning if disk space is predicted to run out within 24 hours
      # Please add ignored mountpoints in node_exporter parameters like
      # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
      # The same rule using "node_filesystem_free_bytes" fires when the disk fills up for non-root users.
      - alert: HostDiskMayFillIn24Hours
        expr: predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem will likely run out of space within the next 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 12. Critical alert if inodes are running out (< 10% left)
      - alert: HostOutOfInodes
        expr: (node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Host out of inodes (instance {{ $labels.instance }})
          description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 13. Critical alert if a filesystem device error has occurred
      - alert: HostFilesystemDeviceError
        expr: node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Host filesystem device error (instance {{ $labels.instance }})
          description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 14. Warning if inodes may fill up within 24 hours
      - alert: HostInodesMayFillIn24Hours
        expr: predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
          description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 15. Warning for unusual disk read latency (> 100ms)
      - alert: HostUnusualDiskReadLatency
        expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 16. Warning for unusual disk write latency (> 100ms)
      - alert: HostUnusualDiskWriteLatency
        expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write latency (instance {{ $labels.instance }})
          description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 17. Warning for high CPU load (> 80% used)
      - alert: HostHighCpuLoad
        expr: 1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Host high CPU load (instance {{ $labels.instance }})
          description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 18. Info alert if CPU is underutilized (< 20% for 1 week).
      # You may want to increase the Alertmanager 'repeat_interval' for this type of alert
      # to daily or weekly (see the commented route example after rule 6).
      - alert: HostCpuIsUnderutilized
        expr: (min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8
        for: 1w
        labels:
          severity: info
        annotations:
          summary: Host CPU is underutilized (instance {{ $labels.instance }})
          description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 19. Warning for high CPU steal (> 10%)
      - alert: HostCpuStealNoisyNeighbor
        expr: avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
          description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 20. Warning for high CPU iowait (> 10%)
      - alert: HostCpuHighIowait
        expr: avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host CPU high iowait (instance {{ $labels.instance }})
          description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 21. Warning for unusual disk IO (> 80% busy)
      - alert: HostUnusualDiskIo
        expr: rate(node_disk_io_time_seconds_total[5m]) > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk IO (instance {{ $labels.instance }})
          description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 22. Warning for high context switching (> 2x the daily average)
      # x2 context switches is an arbitrary number.
      # The alert threshold depends on the nature of the application
      # (see the commented recording-rule sketch after this rule).
      # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
      - alert: HostContextSwitchingHigh
        expr: (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host context switching high (instance {{ $labels.instance }})
          description: "Context switching is growing on the node (twice the daily average during the last 15m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 23. Warning if swap space is using more than 80%
      - alert: HostSwapIsFillingUp
        expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host swap is filling up (instance {{ $labels.instance }})
          description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 24. Warning if a systemd service has crashed
      - alert: HostSystemdServiceCrashed
        expr: (node_systemd_unit_state{state="failed"} == 1)
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host systemd service crashed (instance {{ $labels.instance }})
          description: "systemd service crashed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 25. Warning if a physical component is too hot
      - alert: HostPhysicalComponentTooHot
        expr: node_hwmon_temp_celsius > node_hwmon_temp_max_celsius
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host physical component too hot (instance {{ $labels.instance }})
          description: "Physical hardware component too hot\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 26. Critical alert if an overtemperature alarm has been triggered
      - alert: HostNodeOvertemperatureAlarm
        expr: ((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host node overtemperature alarm (instance {{ $labels.instance }})
          description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 27. Critical alert if a software RAID array has insufficient drives
      - alert: HostSoftwareRaidInsufficientDrives
        expr: ((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
          description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 28. Warning if a software RAID disk has failed
      - alert: HostSoftwareRaidDiskFailure
        expr: (node_md_disks{state="failed"} > 0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host software RAID disk failure (instance {{ $labels.instance }})
          description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 29. Info alert if the kernel version has changed
      - alert: HostKernelVersionDeviations
        expr: changes(node_uname_info[1h]) > 0
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host kernel version deviations (instance {{ $labels.instance }})
          description: "Kernel version for {{ $labels.instance }} has changed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 30. Warning if an OOM kill has occurred
      - alert: HostOomKillDetected
        expr: (increase(node_vmstat_oom_kill[1m]) > 0)
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host OOM kill detected (instance {{ $labels.instance }})
          description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 31. Info alert if correctable EDAC errors have been detected
      - alert: HostEdacCorrectableErrorsDetected
        expr: (increase(node_edac_correctable_errors_total[1m]) > 0)
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 32. Warning if uncorrectable EDAC errors have been detected
      - alert: HostEdacUncorrectableErrorsDetected
        expr: (node_edac_uncorrectable_errors_total > 0)
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 33. Warning for high host network receive errors (> 1% errors)
      - alert: HostNetworkReceiveErrors
        expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Receive Errors (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 34. Warning for high host network transmit errors (> 1% errors)
      - alert: HostNetworkTransmitErrors
        expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Transmit Errors (instance {{ $labels.instance }})
          description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 35. Warning if a network bond is degraded
      - alert: HostNetworkBondDegraded
        expr: ((node_bonding_active - node_bonding_slaves) != 0)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host Network Bond Degraded (instance {{ $labels.instance }})
          description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 36. Warning if conntrack limit has almost been reached (> 80%)
      - alert: HostConntrackLimit
        expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host conntrack limit (instance {{ $labels.instance }})
          description: "The number of conntrack is approaching limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 37. Warning if host clock skew has been detected (> 50ms deviation)
      - alert: HostClockSkew
        expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Host clock skew (instance {{ $labels.instance }})
          description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 38. Warning if host clock is not synchronizing (max error >=16s)
      - alert: HostClockNotSynchronising
        expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host clock not synchronising (instance {{ $labels.instance }})
          description: "Clock not synchronising. Ensure NTP is configured on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

  # ---------------------------------------------------------
  # Podman Exporter Alerts
  # ---------------------------------------------------------
  - name: podman_exporter_alerts
    rules:
      # 39. Alert when a container that exists is not running.
      # Containers you stop intentionally (such as minio-init) can be excluded via the name
      # filter; a commented sketch for excluding more containers follows this rule.
      # The podman-exporter uses integers for status:
      # -1=unknown, 0=created, 1=initialized, 2=running,
      # 3=stopped, 4=paused, 5=exited, 6=removing, 7=stopping
      - alert: PodmanContainerDown
        # We take the state (!= 2) and multiply it by the info metric (which always has value 1).
        # We 'join' them on 'id' and 'instance' and pull in the 'name' label via group_left.
        expr: (podman_container_state != 2) * on(id, instance) group_left(name) podman_container_info{name!="minio-init"}
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} is not running"
          description: "Container {{ $labels.name }} has status code {{ $value }} (not running)."

      # 40. Podman Exporter down alert
      - alert: PodmanExporterDown
        expr: up{job="podman-exporter"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Podman Exporter down"
          description: "Cannot retrieve metrics from the Podman socket."

      # 41. Container Crashes & Status
      - alert: PodmanContainerStopped
        # Alert if a container has crashed or exited with an error (state 5 = exited, exit_code != 0).
        # We use podman_container_exit_code as the base metric, so $value is the actual exit code.
        expr: >
          (
            podman_container_exit_code != 0
            and on(id, instance)
            podman_container_state == 5
          ) * on(id, instance) group_left(name, image) podman_container_info
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} has crashed"
          description: "Container {{ $labels.name }} (Image: {{ $labels.image }}) stopped with exit code {{ $value }}."

      # 42. Container Restart Loop
      - alert: PodmanContainerRestartLoop
        # Alert if a container restarts more than 2x in 5 minutes.
        # The changes() function counts how many times the start_time timestamp changes.
        expr: >
          (
            changes(podman_container_started_seconds[5m]) > 2
          ) * on(id, instance) group_left(name, image) podman_container_info
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} keeps restarting"
          description: "Possible CrashLoop: Container {{ $labels.name }} (Image: {{ $labels.image }}) restarted {{ $value }} times in the last 5 minutes."

      # 43. Container Healthcheck
      - alert: PodmanContainerUnhealthy
        # Only relevant if your containers have a HEALTHCHECK defined in their Dockerfile/Containerfile
        # health_status: 0=healthy, 1=unhealthy, 2=starting
        expr: (podman_container_health == 1) * on(id, instance) group_left(name, image) podman_container_info
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} is unhealthy"
          description: "The healthcheck of container {{ $labels.name }} (Image: {{ $labels.image }}) is failing."

      # 44. High CPU or Memory usage
      - alert: PodmanContainerHighCpu
        # Alert if a container uses more than 80% of 1 CPU core (adjust threshold as needed)
        # We normalize this over 5 minutes to ignore spikes.
        expr: rate(podman_container_cpu_usage_seconds_total[5m]) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage: {{ $labels.name }}"
          description: "Container {{ $labels.name }} is using high CPU ({{ $value }}%)."

      # 45. High Memory usage
      - alert: PodmanContainerHighMemory
        # Alert if a container uses more than 1GB RAM (adjust threshold as needed)
        # Podman exporter returns bytes. 1e9 = 1GB.
        expr: podman_container_memory_usage_bytes > 1e9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Memory usage: {{ $labels.name }}"
          description: "Container {{ $labels.name }} is using more than 1GB RAM ({{ $value | humanize1024 }})."

      # 46. OOM Kills
      - alert: PodmanContainerOOMKilled
        # Works only if podman-exporter has access to OOM events
        expr: increase(podman_container_oom_killed_count[5m]) > 0
        labels:
          severity: critical
        annotations:
          summary: "Container {{ $labels.name }} killed by OOM"
          description: "The container was killed by the system due to out of memory (Out Of Memory)."

  # ---------------------------------------------------------
  # Watchdog Alert
  # ---------------------------------------------------------
  - name: watchdog_alerts
    rules:
      - alert: Watchdog
        expr: vector(1)
        labels:
          severity: none
        annotations:
          summary: "Prometheus is working and sending alerts to Alertmanager"
          description: "This is a heartbeat alert that is always 'firing'."

  # ---------------------------------------------------------
  # Alertmanager Alerts
  # ---------------------------------------------------------
  - name: alertmanager_alerts
    rules:
      # --- ALERTMANAGER ---
      - alert: AlertmanagerConfigInconsistent
        expr: count by (cluster) (count_values by (cluster) ("config_hash", alertmanager_config_hash)) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Alertmanager configuration inconsistent"
          description: "Different alertmanagers have different configs."

      - alert: AlertmanagerFailedReload
        expr: alertmanager_config_last_reload_successful == 0
        labels:
          severity: warning
        annotations:
          summary: "Alertmanager reload failed"
          description: "The latest configuration change could not be loaded."

  # ---------------------------------------------------------
  # Loki Alerts
  # ---------------------------------------------------------
  - name: loki_alerts
    rules:
      # --- LOKI (Because Loki sometimes drops metrics under heavy load) ---
      - alert: LokiRequestErrors
        # Check for 5xx errors in the Loki API
        expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Loki gives many errors"
          description: "More than 10% of requests to Loki fail."

      - alert: LokiPanics
        expr: sum(increase(loki_panic_total[10m])) > 0
        labels:
          severity: critical
        annotations:
          summary: "Loki has crashed (Panic)"
          description: "Loki has registered a panic in the logs/metrics."

      - alert: LokiProcessTooManyRestarts
        expr: changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Loki process too many restarts (instance {{ $labels.instance }})
          description: "A loki process had too many restarts (target {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: LokiRequestErrors
        expr: 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10
        for: 15m
        labels:
          severity: critical
        annotations:
          summary: Loki request errors (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: LokiRequestPanic
        expr: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request panic (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: LokiRequestLatency
        expr: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le, instance, job, route))) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Loki request latency (instance {{ $labels.instance }})
          description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

  # ---------------------------------------------------------
  # Grafana Alerts
  # ---------------------------------------------------------
  - name: grafana_alerts
    rules:
      - alert: GrafanaDown
        expr: up{job="grafana"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Grafana is offline"
          description: "The Grafana dashboard cannot be reached by Prometheus."

  # ---------------------------------------------------------
  # KeepHQ Alerts
  # ---------------------------------------------------------
  - name: keephq_alerts
    rules:
      # Alerts when the KeepHQ API responds slowly
      - alert: KeepHQApiSlow
        expr: rate(keep_http_request_duration_seconds_sum[5m]) / rate(keep_http_request_duration_seconds_count[5m]) > 2
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "KeepHQ is slow"
          description: "Average response time of KeepHQ is greater than 2 seconds."

  # ---------------------------------------------------------
  # Karma Alerts
  # ---------------------------------------------------------
  - name: karma_alerts
    rules:
      - alert: KarmaDashboardDown
        expr: up{job="karma"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Karma is offline"
          description: "The Karma dashboard cannot be reached by Prometheus."

  # ---------------------------------------------------------
  # Alloy Alerts
  # ---------------------------------------------------------
  - name: alloy_alerts
    rules:
      # --- ALLOY ---
      - alert: AlloyUnhealthy
        # This alert assumes that the alloy_component_controller_running_components metric has a label 'health_type' that can be 'healthy' or 'unhealthy'.
        expr: alloy_component_controller_running_components{health_type="unhealthy"} > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Alloy component unhealthy (component {{ $labels.component_id }})"
          description: "Component {{ $labels.component_id }} in Alloy reports a error."

      - alert: GrafanaAlloyServiceDown
        expr: count by (instance) (alloy_build_info offset 2m) unless count by (instance) (alloy_build_info)
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Grafana Alloy service down (instance {{ $labels.instance }})
          description: "Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

  # ---------------------------------------------------------
  # Blackbox Exporter Alerts
  # ---------------------------------------------------------
  - name: blackbox_alerts
    rules:
      - alert: ServiceHealthCheckFailed
        # probe_success is 1 (success) or 0 (failed)
        expr: probe_success == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Health Check Failed: {{ $labels.instance }}"
          description: "The HTTP probe to {{ $labels.instance }} failed. Service might be down or responding slowly."

      - alert: ServiceSlowResponse
        # Warning if response is slower than 1 second
        expr: probe_duration_seconds > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow response: {{ $labels.instance }}"
          description: "The HTTP probe to {{ $labels.instance }} takes longer than 1 second (current value: {{ $value | humanizeDuration }})."

  # ---------------------------------------------------------
  # OpenTelemetry Collector Alerts
  # ---------------------------------------------------------
  - name: opentelemetrycollector_alerts
    rules:
      - alert: OpentelemetryCollectorDown
        expr: up{job=~".*otel.*collector.*"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector down (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector instance has disappeared or is not being scraped\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: OpentelemetryCollectorReceiverRefusedSpans
        expr: rate(otelcol_receiver_refused_spans[5m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector is refusing spans on {{ $labels.receiver }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: OpentelemetryCollectorReceiverRefusedMetricPoints
        expr: rate(otelcol_receiver_refused_metric_points[5m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector receiver refused metric points (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector is refusing metric points on {{ $labels.receiver }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: OpentelemetryCollectorReceiverRefusedLogRecords
        expr: rate(otelcol_receiver_refused_log_records[5m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector receiver refused log records (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector is refusing log records on {{ $labels.receiver }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: OpentelemetryCollectorExporterFailedSpans
        expr: rate(otelcol_exporter_send_failed_spans[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector exporter failed spans (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector failing to send spans via {{ $labels.exporter }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: OpentelemetryCollectorExporterFailedMetricPoints
        expr: rate(otelcol_exporter_send_failed_metric_points[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector exporter failed metric points (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector failing to send metric points via {{ $labels.exporter }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: OpentelemetryCollectorExporterFailedLogRecords
        expr: rate(otelcol_exporter_send_failed_log_records[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector exporter failed log records (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector failing to send log records via {{ $labels.exporter }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: OpentelemetryCollectorExporterQueueNearlyFull
        expr: (otelcol_exporter_queue_size / otelcol_exporter_queue_capacity) > 0.8 and otelcol_exporter_queue_capacity > 0
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector exporter queue nearly full (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: OpentelemetryCollectorProcessorRefusedSpans
        expr: rate(otelcol_processor_refused_spans[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector processor refused spans (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans, likely due to backpressure\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: OpentelemetryCollectorProcessorRefusedMetricPoints
        expr: rate(otelcol_processor_refused_metric_points[5m]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector processor refused metric points (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points, likely due to backpressure\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: OpentelemetryCollectorHighMemoryUsage
        expr: (otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector high memory usage (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector memory usage is above 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      - alert: OpentelemetryCollectorOtlpReceiverErrors
        expr: rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector OTLP receiver errors (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

  # ---------------------------------------------------------
  # MinIO Alerts
  # ---------------------------------------------------------
  - name: minio_alerts
    rules:
      - alert: MinioClusterOffline
        expr: minio_cluster_nodes_offline_total > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "MinIO Node Offline"
          description: "A node in the MinIO cluster is offline."

      - alert: MinioDriveOffline
        expr: minio_cluster_drive_offline_total > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "MinIO Drive Offline"
          description: "A drive in MinIO is not available."

  # ---------------------------------------------------------
  # Tempo Alerts
  # ---------------------------------------------------------
  - name: tempo_alerts
    rules:
      - alert: HighSpanLatency
        # Warning if the 95th percentile latency is > 2s
        expr: histogram_quantile(0.95, sum(rate(traces_spanmetrics_latency_bucket[5m])) by (le, service)) > 2
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Service {{ $labels.service }} is slow."
          description: "The 95th percentile latency of traces is higher than 2s."

  # ---------------------------------------------------------
  # Traefik Alerts
  # ---------------------------------------------------------
  - name: traefik_alerts
    rules:
      - alert: TraefikDown
        expr: up{job="traefik"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Traefik is offline"
          description: "Traefik reverse proxy does not respond."

      - alert: TraefikHighHttp5xxErrorRate
        # Warning if more than 5% of requests result in 5xx errors
        # (a commented per-service variant follows this rule).
        expr: sum(rate(traefik_entrypoint_request_duration_seconds_count{code=~"5.."}[5m])) / sum(rate(traefik_entrypoint_request_duration_seconds_count[5m])) * 100 > 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Many 5xx errors in Traefik"
          description: "More than 5% of all incoming requests via Traefik result in a 5xx server error. Check the backend containers!"

  # ---------------------------------------------------------
  # Pyroscope Alerts
  # ---------------------------------------------------------
  - name: pyroscope_alerts
    rules:
      # 1. Container Health: High CPU
      - alert: PyroscopeHighCpuUsage
        expr: rate(process_cpu_seconds_total{job=~".*pyroscope.*"}[2m]) > 2.0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU load on Pyroscope"
          description: "Pyroscope (instance {{ $labels.instance }}) has been using more than 2 CPU cores for 5 minutes."

      # 2. Container Health: High Memory Usage (OOM risk)
      - alert: PyroscopeHighMemoryUsage
        # Alerts if RSS memory exceeds 2 GB
        expr: process_resident_memory_bytes{job=~".*pyroscope.*"} > 2147483648
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Pyroscope memory usage is critically high"
          description: "Pyroscope (instance {{ $labels.instance }}) is using over 2GB of memory. Risk of an Out-Of-Memory (OOM) kill."

      # 3. Container Health: Possible Memory Leak or Goroutine Leak
      - alert: PyroscopeHighGoroutineCount
        expr: go_goroutines{job=~".*pyroscope.*"} > 10000
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Pyroscope has an unusually high number of goroutines"
          description: "Pyroscope (instance {{ $labels.instance }}) has more than 10,000 active goroutines, which may indicate a leak."

      # 4. Container Health: Frequent Restarts
      - alert: PyroscopeFrequentRestarts
        # Alerts if uptime is less than 5 minutes
        expr: (time() - process_start_time_seconds{job=~".*pyroscope.*"}) < 300
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Pyroscope has recently restarted"
          description: "The Pyroscope container on {{ $labels.instance }} has just started (uptime < 5m)."

      # 5. Database Performance: Slow S3/MinIO connection
      - alert: PyroscopeSlowBlockOpening
        # Alerts if opening a block of data takes longer than 1 second on average
        expr: (sum by (instance) (rate(pyroscopedb_block_opening_duration_sum{job=~".*pyroscope.*"}[5m])) / sum by (instance) (rate(pyroscopedb_block_opening_duration_count{job=~".*pyroscope.*"}[5m]))) > 1.0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow loading of Pyroscope storage blocks"
          description: "Opening profile blocks in the backend storage (MinIO) takes longer than 1 second on average on {{ $labels.instance }}. This will make the Explore view slow."

      # 6. Cluster Performance: Gossip network slow
      - alert: PyroscopeHighGossipLatency
        # Alerts if internal cluster communication takes longer than 500ms
        expr: timer_memberlist_gossip{job=~".*pyroscope.*", quantile="0.5"} > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High network latency in Pyroscope cluster"
          description: "The median memberlist gossip latency on {{ $labels.instance }} is greater than 0.5 seconds."