alertManager部署安裝、告警規則配置詳解及告警消息推送

?
java接受告警請求@RestController
@RequestMapping("/alert")
@Slf4j
public class TestApi {private static final DateTimeFormatter FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");@RequestMappingpublic void sendTemplate(HttpServletRequest request) throws Exception {String requestBody = StreamUtils.copyToString(request.getInputStream(), StandardCharsets.UTF_8);JSONObject jsonObject = JSONUtil.parseObj(requestBody);log.info("sendTemplate {}", jsonObject);// 遍歷告警信息JSONArray alerts = jsonObject.getJSONArray("alerts");for (int i = 0; i < alerts.size(); i++) {JSONObject alert = alerts.getJSONObject(i);JSONObject labels = alert.getJSONObject("labels");JSONObject annotations = alert.getJSONObject("annotations");// 替換模板中的占位符Map<String, Object> templateData = new HashMap<>();templateData.put("sendTime", LocalDateTime.now().format(FORMATTER));templateData.put("alertname", labels.getStr("alertname"));templateData.put("instance", labels.getStr("instance"));templateData.put("severity", labels.getStr("severity"));templateData.put("status", alert.getStr("status"));templateData.put("startsAt", alert.getStr("startsAt"));templateData.put("description", annotations.getStr("description"));templateData.put("generatorURL", alert.getStr("generatorURL"));String alertMsg = TemplateUtils.renderTemplate("alert.ftl", templateData);// 調用企業微信機器人發送消息WeComBot.sendToWeComBot(alertMsg);}}
}?

       <!-- FreeMarker Template Engine --><dependency><groupId>org.springframework.boot</groupId><artifactId>spring-boot-starter-freemarker</artifactId></dependency>import freemarker.template.Configuration;
import freemarker.template.Template;import java.io.StringWriter;
import java.util.Map;public class TemplateUtils {private static final Configuration freemarkerConfig;// 靜態初始化 FreeMarker 配置static {freemarkerConfig = new Configuration(Configuration.VERSION_2_3_31);freemarkerConfig.setClassForTemplateLoading(TemplateUtils.class, "/templates");freemarkerConfig.setDefaultEncoding("UTF-8");}/*** 使用 FreeMarker 渲染模板** @param templateName 模板文件名（如 "alert.ftl"）* @param data         數據模型（鍵值對）* @return 渲染后的字符串*/public static String renderTemplate(String templateName, Map<String, Object> data) {try {// 加載模板Template template = freemarkerConfig.getTemplate(templateName);// 渲染模板StringWriter writer = new StringWriter();template.process(data, writer);return writer.toString();} catch (Exception e) {throw new RuntimeException("模板渲染失敗", e);}}
}public class WeComBot {private static final String WEBHOOK_URL = "https://qyapi.weixin.qq.com/cgi-binbbfc-4412c60ad031";/*** 發送消息到企業微信機器人** @param message 消息內容* @throws Exception 如果發送失敗*/public static void sendToWeComBot(String message) throws Exception {// 構造 JSON 數據String jsonPayload = JSONUtil.createObj().put("msgtype", "markdown").put("markdown", JSONUtil.createObj().put("content", message)).toString();// 發送 HTTP POST 請求HttpResponse response = HttpRequest.post(WEBHOOK_URL).header("Content-Type", "application/json; utf-8") // 設置請求頭.body(jsonPayload) // 設置請求體.timeout(5000) // 設置超時時間為 5 秒（單位：毫秒）.execute(); // 執行請求// 檢查響應狀態碼if (response.getStatus() != 200) {throw new RuntimeException("Failed to send message: HTTP error code " + response.getStatus());}}
}

? alertmanager:
? ? image: prom/alertmanager:v0.26.0
? ? environment:
? ? ? - TZ=Asia/Shanghai
? ? container_name: alertmanager
? ? volumes:
? ? ? - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
? ? ? - ./alertmanager/templates:/etc/alertmanager/templates
? ? ? - ./alertmanager/data:/alertmanager
? ? command:
? ? ? - "--config.file=/etc/alertmanager/alertmanager.yml"
? ? ? - "--storage.path=/alertmanager"
? ? ? - "--log.level=info" ? ? ? ? ? ? # 設置日志級別（可選）
? ? ports:
? ? ? - "9093:9093"
? ? ? - "9094:9094"
? ? restart: always

docker-prometheus.yamlversion: '3.8'services:prometheus:image: bitnami/prometheus:3.0.0container_name: prometheushostname: prometheusports:- "9090:9090" # Prometheus Web UI 端口volumes:- ./prometheus.yml:/etc/prometheus/prometheus.yml- ./prometheus-data:/prometheus- ./rules:/rulescommand:- '--config.file=/etc/prometheus/prometheus.yml'- '--web.external-url=http://192.168.118.20:9090/'- '--web.enable-lifecycle'- '--storage.tsdb.retention.time=90d'- "--storage.tsdb.path=/prometheus"- "--web.enable-admin-api"restart: alwaysalertmanager:image: prom/alertmanager:v0.26.0environment:- TZ=Asia/Shanghaicontainer_name: alertmanagervolumes:- ./alertmanager.yml:/etc/alertmanager/alertmanager.yml- ./alertmanager/templates:/etc/alertmanager/templates- ./alertmanager/data:/alertmanagercommand:- "--config.file=/etc/alertmanager/alertmanager.yml"- "--storage.path=/alertmanager"- "--log.level=info"             # 設置日志級別（可選）ports:- "9093:9093"- "9094:9094"restart: alwaysgrafana:image: grafana/grafana:11.3.3container_name: grafanahostname: grafanaports:- "3000:3000" # Grafana Web UI 端口environment:GF_SECURITY_ADMIN_PASSWORD: admin # 設置 Grafana 的管理員密碼volumes:- ./grafana-storage:/var/lib/grafanarestart: alwaysnode-exporter:image: bitnami/node-exporter:1.8.1container_name: node-exporterrestart: unless-stoppedports:- "9100:9100"volumes:- /proc:/host/proc:ro- /sys:/host/sys:ro- /:/rootfs:roenvironment:IGNORE_MOUNT_POINTS: "^/(sys|proc|dev|host|etc)($$|/)"IGNORE_FS_TYPES: "^(sys|proc|auto)fs$$"command:- '--path.procfs=/host/proc'- '--path.sysfs=/host/sys'- '--path.rootfs=/rootfs'  # 修復了未閉合的引號- '--collector.filesystem.ignored-mount-points=${IGNORE_MOUNT_POINTS}'- '--collector.filesystem.ignored-fs-types=${IGNORE_FS_TYPES}'  # 修復了無效的 #{}

alertmanager.ymlglobal:resolve_timeout: 5m #表示如果告警在 5 分鐘內沒有被解決，則認為該告警已恢復route:receiver: 'default'group_by: ['instance'] #通過alertname(告警名稱)的值對告警進行分類 ;按照實例（instance）對告警進行分組group_wait: 10s  #表示第一次觸發告警時會等待 10 秒后再發送通group_interval: 20s #表示兩次告警之間的最小間隔為 20 秒; 同一組內兩次告警之間的最小間隔為 20 秒repeat_interval: 1m #如果告警持續存在，每隔 1 分鐘重復發送一次通知routes:- receiver: "hook"  #webhook通知group_wait: 10s#match:# service: "test"#severity: "critical"# match_re:#   service: "pods|critical"#   severity: "warning"# matchers:#   - service =~ "test|pods|critical"#   - severity =~ "critical|warning"- receiver: "hook1"  #郵件通知group_wait: 25s#matchers:# - severity =~ "critical|warning|info"receivers:
- name: 'hook'webhook_configs:- url: 'http://192.168.118.47:7998/alert'- name: "hook1"webhook_configs:- url: 'https://xe88-864e-8a9e7c476a18'send_resolved: true #通知已經恢復的告警- name: "default"webhook_configs:- url: 'https://x4af1-bbfc-4412c60ad031'send_resolved: true #通知已經恢復的告警- name: 'wechat'webhook_configs:- url: 'https://x-bbfc-4412c60ad031'send_resolved: trueinhibit_rules: #抑制的規則
- source_match:severity: 'critical'target_match:severity: 'warning'equal: ['alertname', 'dev', 'instance']

alerting:
? alertmanagers:
? ? - static_configs:
? ? ? ? - targets:
? ? ? ? ? ? - 192.168.118.20:9093

rule_files:
? - "/rules/*_rules.yaml"

prometheus.ymlglobal:scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.# scrape_timeout is set to the global default (10s).# Alertmanager configuration
alerting:alertmanagers:- static_configs:- targets:- 192.168.118.20:9093rule_files:- "/rules/*_rules.yaml"scrape_configs:- job_name: 'kafka'static_configs:- targets:- '192.168.118.20:9101' # 對應 kafka1 的 JMX Exporter 端口- '192.168.118.20:9102' # 對應 kafka2 的 JMX Exporter 端口- '192.168.118.20:9103' # 對應 kafka3 的 JMX Exporter 端口- job_name: "node"static_configs:- targets: ["192.168.118.20:9100"]- job_name: 'prometheus'metrics_path: /actuator/prometheusstatic_configs:- targets: ['192.168.118.47:7998']- job_name: 'prometheus1'metrics_path: /actuator/prometheusstatic_configs:- targets: ['192.168.118.148:7998']

customer_rules.yamlgroups:- name: node-alertrules:- alert: NodeDownexpr: up == 0for: 5mlabels:severity: criticalinstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} down"description: "Instance: {{ $labels.instance }} 已經宕機 5分鐘"value: "{{ $value }}"- alert: NodeCpuHighexpr: (1 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100 > 10for: 10slabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} cpu使用率過高"description: "CPU 使用率超過 80%"value: "{{ $value }}"- alert: NodeCpuIowaitHighexpr: avg by (instance) (irate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 80for: 5mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} cpu iowait 使用率過高"description: "CPU iowait 使用率超過 50%"value: "{{ $value }}"- alert: NodeLoad5Highexpr: node_load5 > (count by (instance) (node_cpu_seconds_total{mode='system'})) * 1.2for: 5mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} load(5m) 過高"description: "Load(5m) 過高，超出cpu核數 1.2倍"value: "{{ $value }}"- alert: NodeMemoryHighexpr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 10for: 10slabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} memory 使用率過高"description: "Memory 使用率超過 10%"value: "{{ $value }}"- alert: NodeDiskRootHighexpr: (1 - node_filesystem_avail_bytes{fstype=~"ext.*|xfs",mountpoint ="/"} / node_filesystem_size_bytes{fstype=~"ext.*|xfs",mountpoint ="/"}) * 100 > 50for: 1mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk(/ 分區) 使用率過高"description: "Disk(/ 分區) 使用率超過 50%"value: "{{ $value }}"- alert: NodeDiskBootHighexpr: (1 - node_filesystem_avail_bytes{fstype=~"ext.*|xfs",mountpoint ="/boot"} / node_filesystem_size_bytes{fstype=~"ext.*|xfs",mountpoint ="/boot"}) * 100 > 50for: 10slabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk(/boot 分區) 使用率過高"description: "Disk(/boot 分區) 使用率超過 50%"value: "{{ $value }}"- alert: NodeDiskReadHighexpr: irate(node_disk_read_bytes_total[5m]) > 20 * (1024 ^ 2)for: 5mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk 讀取字節數 速率過高"description: "Disk 讀取字節數 速率超過 20 MB/s"value: "{{ $value }}"- alert: NodeDiskWriteHighexpr: irate(node_disk_written_bytes_total[5m]) > 20 * (1024 ^ 2)for: 5mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk 寫入字節數 速率過高"description: "Disk 寫入字節數 速率超過 20 MB/s"value: "{{ $value }}"- alert: NodeDiskReadRateCountHighexpr: irate(node_disk_reads_completed_total[5m]) > 3000for: 5mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk iops 每秒讀取速率過高"description: "Disk iops 每秒讀取速率超過 3000 iops"value: "{{ $value }}"- alert: NodeDiskWriteRateCountHighexpr: irate(node_disk_writes_completed_total[5m]) > 3000for: 5mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk iops 每秒寫入速率過高"description: "Disk iops 每秒寫入速率超過 3000 iops"value: "{{ $value }}"- alert: NodeInodeRootUsedPercentHighexpr: (1 - node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint="/"} / node_filesystem_files{fstype=~"ext4|xfs",mountpoint="/"}) * 100 > 80for: 10mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk(/ 分區) inode 使用率過高"description: "Disk (/ 分區) inode 使用率超過 80%"value: "{{ $value }}"- alert: NodeInodeBootUsedPercentHighexpr: (1 - node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint="/boot"} / node_filesystem_files{fstype=~"ext4|xfs",mountpoint="/boot"}) * 100 > 80for: 10mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} disk(/boot 分區) inode 使用率過高"description: "Disk (/boot 分區) inode 使用率超過 80%"value: "{{ $value }}"- alert: NodeFilefdAllocatedPercentHighexpr: node_filefd_allocated / node_filefd_maximum * 100 > 80for: 10mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} filefd 打開百分比過高"description: "Filefd 打開百分比 超過 80%"value: "{{ $value }}"- alert: NodeNetworkNetinBitRateHighexpr: avg by (instance) (irate(node_network_receive_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m]) * 8) > 20 * (1024 ^ 2) * 8for: 3mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} network 接收比特數 速率過高"description: "Network 接收比特數 速率超過 20MB/s"value: "{{ $value }}"- alert: NodeNetworkNetoutBitRateHighexpr: avg by (instance) (irate(node_network_transmit_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m]) * 8) > 20 * (1024 ^ 2) * 8for: 3mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} network 發送比特數 速率過高"description: "Network 發送比特數 速率超過 20MB/s"value: "{{ $value }}"- alert: NodeNetworkNetinPacketErrorRateHighexpr: avg by (instance) (irate(node_network_receive_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m])) > 15for: 3mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} 接收錯誤包 速率過高"description: "Network 接收錯誤包 速率超過 15個/秒"value: "{{ $value }}"- alert: NodeNetworkNetoutPacketErrorRateHighexpr: avg by (instance) (irate(node_network_transmit_packets_total{device=~"eth0|eth1|ens33|ens37"}[1m])) > 15for: 3mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} 發送錯誤包 速率過高"description: "Network 發送錯誤包 速率超過 15個/秒"value: "{{ $value }}"- alert: NodeProcessBlockedHighexpr: node_procs_blocked{job="node"} > 10for: 10mlabels:severity: warninginstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} 當前被阻塞的任務的數量過多"description: "Process 當前被阻塞的任務的數量超過 10個"value: "{{ $value }}"- alert: NodeTimeOffsetHighexpr: abs(node_timex_offset_seconds{job="node"}) > 3 * 60for: 2mlabels:severity: infoinstance: "{{ $labels.instance }}"annotations:summary: "instance: {{ $labels.instance }} 時間偏差過大"description: "Time 節點的時間偏差超過 3m"value: "{{ $value }}"

https://segmentfault.com/a/1190000043690204

prometheus結合consul+confd實現動態注冊服務和動態更新配置告警規則_prometheus confd-CSDN博客

如若想動態修改下面規則內容；? ?可采用以下方案；
rule_files:
? # - "first_rules.yml"
? # - "second_rules.yml"
? - "/rules/*_rules.yaml"
?

# Download the binary
wget https://github.com/kelseyhightower/confd/releases/download/v0.16.0/confd-0.16.0-linux-amd64
?
# 重命名二進制文件，并移動到PATH的目錄下
mv confd-0.16.0-linux-amd64 /usr/local/bin/confd
chmod +x /usr/local/bin/confd
?
# 驗證是否安裝成功
confd --help

sudo mkdir -p /etc/confd/{conf.d,templates,rules}