?
? java接受告警請求@RestController @RequestMapping("/alert") @Slf4j public class TestApi {private static final DateTimeFormatter FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");@RequestMappingpublic void sendTemplate(HttpServletRequest request) throws Exception {String requestBody = StreamUtils.copyToString(request.getInputStream(), StandardCharsets.UTF_8);JSONObject jsonObject = JSONUtil.parseObj(requestBody);log.info("sendTemplate {}", jsonObject);// 遍歷告警信息JSONArray alerts = jsonObject.getJSONArray("alerts");for (int i = 0; i < alerts.size(); i++) {JSONObject alert = alerts.getJSONObject(i);JSONObject labels = alert.getJSONObject("labels");JSONObject annotations = alert.getJSONObject("annotations");// 替換模板中的占位符Map<String, Object> templateData = new HashMap<>();templateData.put("sendTime", LocalDateTime.now().format(FORMATTER));templateData.put("alertname", labels.getStr("alertname"));templateData.put("instance", labels.getStr("instance"));templateData.put("severity", labels.getStr("severity"));templateData.put("status", alert.getStr("status"));templateData.put("startsAt", alert.getStr("startsAt"));templateData.put("description", annotations.getStr("description"));templateData.put("generatorURL", alert.getStr("generatorURL"));String alertMsg = TemplateUtils.renderTemplate("alert.ftl", templateData);// 調用企業微信機器人發送消息WeComBot.sendToWeComBot(alertMsg);}} }?
<!-- FreeMarker Template Engine -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-freemarker</artifactId>
</dependency>

import freemarker.template.Configuration;
import freemarker.template.Template;

import java.io.StringWriter;
import java.util.Map;

/**
 * Static helper that renders FreeMarker templates loaded from the
 * {@code /templates} classpath directory.
 */
public class TemplateUtils {

    private static final Configuration freemarkerConfig;

    // Static initialisation of the shared FreeMarker configuration.
    static {
        freemarkerConfig = new Configuration(Configuration.VERSION_2_3_31);
        freemarkerConfig.setClassForTemplateLoading(TemplateUtils.class, "/templates");
        freemarkerConfig.setDefaultEncoding("UTF-8");
    }

    /** Static-only utility; not meant to be instantiated. */
    private TemplateUtils() {
    }

    /**
     * Renders a template with FreeMarker.
     *
     * @param templateName template file name (for example {@code "alert.ftl"})
     * @param data         data model (key/value pairs)
     * @return the rendered text
     * @throws RuntimeException if the template cannot be loaded or processed;
     *                          the original failure is kept as the cause
     */
    public static String renderTemplate(String templateName, Map<String, Object> data) {
        try {
            // Load the template.
            Template template = freemarkerConfig.getTemplate(templateName);
            // Render it into an in-memory buffer.
            StringWriter writer = new StringWriter();
            template.process(data, writer);
            return writer.toString();
        } catch (Exception e) {
            throw new RuntimeException("模板渲染失敗", e);
        }
    }
}

/**
 * Static helper that posts a markdown message to the configured webhook.
 */
public class WeComBot {

    // NOTE(review): this URL looks truncated/redacted in the source; replace it with the real webhook address.
    private static final String WEBHOOK_URL = "https://qyapi.weixin.qq.com/cgi-binbbfc-4412c60ad031";

    /** Static-only utility; not meant to be instantiated. */
    private WeComBot() {
    }

    /**
     * Sends a message to the WeCom bot as a {@code markdown} payload.
     *
     * @param message message content
     * @throws Exception if the request fails or the HTTP status is not 200
     */
    public static void sendToWeComBot(String message) throws Exception {
        // Build the JSON payload: {"msgtype":"markdown","markdown":{"content":...}}
        String jsonPayload = JSONUtil.createObj()
                .put("msgtype", "markdown")
                .put("markdown", JSONUtil.createObj().put("content", message))
                .toString();

        // Send the HTTP POST request; try-with-resources closes the response when done.
        try (HttpResponse response = HttpRequest.post(WEBHOOK_URL)
                // The charset must be a named parameter; a bare "utf-8" is not a valid media-type parameter.
                .header("Content-Type", "application/json; charset=utf-8")
                .body(jsonPayload)
                .timeout(5000) // timeout in milliseconds (5 seconds)
                .execute()) {
            // Check the response status code.
            // NOTE(review): only the HTTP status is inspected; if the webhook reports failures
            // inside the response body they go unnoticed here - confirm against the bot API.
            if (response.getStatus() != 200) {
                throw new RuntimeException("Failed to send message: HTTP error code " + response.getStatus());
            }
        }
    }
}
? alertmanager:
? ? image: prom/alertmanager:v0.26.0
? ? environment:
? ? ? - TZ=Asia/Shanghai
? ? container_name: alertmanager
? ? volumes:
? ? ? - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
? ? ? - ./alertmanager/templates:/etc/alertmanager/templates
? ? ? - ./alertmanager/data:/alertmanager
? ? command:
? ? ? - "--config.file=/etc/alertmanager/alertmanager.yml"
? ? ? - "--storage.path=/alertmanager"
? ? ? - "--log.level=info" ? ? ? ? ? ? # 設置日志級別(可選)
? ? ports:
? ? ? - "9093:9093"
? ? ? - "9094:9094"
? ? restart: always
# docker-prometheus.yaml
version: '3.8'

services:
  prometheus:
    image: bitnami/prometheus:3.0.0
    container_name: prometheus
    hostname: prometheus
    ports:
      - "9090:9090" # Prometheus web UI port
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus-data:/prometheus
      - ./rules:/rules
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--web.external-url=http://192.168.118.20:9090/'
      - '--web.enable-lifecycle'
      - '--storage.tsdb.retention.time=90d'
      - "--storage.tsdb.path=/prometheus"
      - "--web.enable-admin-api"
    restart: always

  alertmanager:
    image: prom/alertmanager:v0.26.0
    environment:
      - TZ=Asia/Shanghai
    container_name: alertmanager
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - ./alertmanager/templates:/etc/alertmanager/templates
      - ./alertmanager/data:/alertmanager
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"
      - "--storage.path=/alertmanager"
      - "--log.level=info" # set the log level (optional)
    ports:
      - "9093:9093"
      - "9094:9094"
    restart: always

  grafana:
    image: grafana/grafana:11.3.3
    container_name: grafana
    hostname: grafana
    ports:
      - "3000:3000" # Grafana web UI port
    environment:
      # Grafana administrator password.
      # NOTE(review): "admin" is a trivially guessable value; change it before exposing the UI.
      GF_SECURITY_ADMIN_PASSWORD: admin
    volumes:
      - ./grafana-storage:/var/lib/grafana
    restart: always

  node-exporter:
    image: bitnami/node-exporter:1.8.1
    container_name: node-exporter
    restart: unless-stopped
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    environment:
      # Kept for compatibility; these only set variables inside the container.
      IGNORE_MOUNT_POINTS: "^/(sys|proc|dev|host|etc)($$|/)"
      IGNORE_FS_TYPES: "^(sys|proc|auto)fs$$"
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
      # The regexes are written inline: Compose resolves ${VAR} in this file from the
      # host shell / .env file, not from the service's own `environment:` block, so
      # referencing the variables above here would expand to empty strings.
      # `$$` is the Compose escape for a literal `$`.
      - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
      - '--collector.filesystem.ignored-fs-types=^(sys|proc|auto)fs$$'
# alertmanager.yml
global:
  resolve_timeout: 5m # an alert that has not been updated for 5 minutes is declared resolved

route:
  receiver: 'default'
  group_by: ['instance'] # group alerts by their instance label
  group_wait: 10s # wait 10 seconds before sending the first notification for a new group
  group_interval: 20s # at least 20 seconds between two notifications for the same group
  repeat_interval: 1m # while an alert keeps firing, repeat the notification every minute
  # NOTE(review): the first child route has no matchers, so it matches every alert;
  # without `continue: true` on it the "hook1" route below is never evaluated.
  # Confirm which receivers are really meant to be notified.
  routes:
    - receiver: "hook" # webhook notification
      group_wait: 10s
      #match:
      #  service: "test"
      #  severity: "critical"
      # match_re:
      #  service: "pods|critical"
      #  severity: "warning"
      # matchers:
      #  - service =~ "test|pods|critical"
      #  - severity =~ "critical|warning"
    - receiver: "hook1" # email notification
      group_wait: 25s
      #matchers:
      #  - severity =~ "critical|warning|info"

receivers:
  - name: 'hook'
    webhook_configs:
      - url: 'http://192.168.118.47:7998/alert'
  - name: "hook1"
    webhook_configs:
      - url: 'https://xe88-864e-8a9e7c476a18'
        send_resolved: true # also notify when an alert has recovered
  - name: "default"
    webhook_configs:
      - url: 'https://x4af1-bbfc-4412c60ad031'
        send_resolved: true # also notify when an alert has recovered
  # NOTE(review): no route in this file references the 'wechat' receiver.
  - name: 'wechat'
    webhook_configs:
      - url: 'https://x-bbfc-4412c60ad031'
        send_resolved: true

inhibit_rules: # inhibition rules
  # A firing 'critical' alert mutes 'warning' alerts that carry the same values
  # for every label listed under `equal`.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
# Where Prometheus sends alerts, and which rule files it loads.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.118.20:9093
rule_files:
  - "/rules/*_rules.yaml"
# prometheus.yml
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.118.20:9093

rule_files:
  - "/rules/*_rules.yaml"

scrape_configs:
  - job_name: 'kafka'
    static_configs:
      - targets:
          - '192.168.118.20:9101' # JMX Exporter port of kafka1
          - '192.168.118.20:9102' # JMX Exporter port of kafka2
          - '192.168.118.20:9103' # JMX Exporter port of kafka3

  - job_name: "node"
    static_configs:
      - targets: ["192.168.118.20:9100"]

  - job_name: 'prometheus'
    metrics_path: /actuator/prometheus
    static_configs:
      - targets: ['192.168.118.47:7998']

  - job_name: 'prometheus1'
    metrics_path: /actuator/prometheus
    static_configs:
      - targets: ['192.168.118.148:7998']
# customer_rules.yaml
# Node-level alerting rules. Every rule copies the instance label and exposes the
# evaluated value through the `value` annotation.
groups:
  - name: node-alert
    rules:
      - alert: NodeDown
        expr: up == 0
        for: 5m
        labels:
          severity: critical
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} down"
          description: "Instance: {{ $labels.instance }} 已經宕機 5分鐘"
          value: "{{ $value }}"

      # Description states the threshold the expression actually uses (10).
      # NOTE(review): 10% / 10s look like test values; raise both together if 80% was intended.
      - alert: NodeCpuHigh
        expr: (1 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100 > 10
        for: 10s
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} cpu使用率過高"
          description: "CPU 使用率超過 10%"
          value: "{{ $value }}"

      # Description states the threshold the expression actually uses (80).
      - alert: NodeCpuIowaitHigh
        expr: avg by (instance) (irate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 80
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} cpu iowait 使用率過高"
          description: "CPU iowait 使用率超過 80%"
          value: "{{ $value }}"

      # `on (instance)` is required: node_load5 carries more labels than the
      # `count by (instance)` side, so without it the two vectors never match
      # and the alert cannot fire.
      - alert: NodeLoad5High
        expr: node_load5 > on (instance) (count by (instance) (node_cpu_seconds_total{mode='system'}) * 1.2)
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} load(5m) 過高"
          description: "Load(5m) 過高,超出cpu核數 1.2倍"
          value: "{{ $value }}"

      - alert: NodeMemoryHigh
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 10
        for: 10s
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} memory 使用率過高"
          description: "Memory 使用率超過 10%"
          value: "{{ $value }}"

      - alert: NodeDiskRootHigh
        expr: (1 - node_filesystem_avail_bytes{fstype=~"ext.*|xfs",mountpoint ="/"} / node_filesystem_size_bytes{fstype=~"ext.*|xfs",mountpoint ="/"}) * 100 > 50
        for: 1m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk(/ 分區) 使用率過高"
          description: "Disk(/ 分區) 使用率超過 50%"
          value: "{{ $value }}"

      - alert: NodeDiskBootHigh
        expr: (1 - node_filesystem_avail_bytes{fstype=~"ext.*|xfs",mountpoint ="/boot"} / node_filesystem_size_bytes{fstype=~"ext.*|xfs",mountpoint ="/boot"}) * 100 > 50
        for: 10s
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk(/boot 分區) 使用率過高"
          description: "Disk(/boot 分區) 使用率超過 50%"
          value: "{{ $value }}"

      - alert: NodeDiskReadHigh
        expr: irate(node_disk_read_bytes_total[5m]) > 20 * (1024 ^ 2)
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk 讀取字節數 速率過高"
          description: "Disk 讀取字節數 速率超過 20 MB/s"
          value: "{{ $value }}"

      - alert: NodeDiskWriteHigh
        expr: irate(node_disk_written_bytes_total[5m]) > 20 * (1024 ^ 2)
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk 寫入字節數 速率過高"
          description: "Disk 寫入字節數 速率超過 20 MB/s"
          value: "{{ $value }}"

      - alert: NodeDiskReadRateCountHigh
        expr: irate(node_disk_reads_completed_total[5m]) > 3000
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk iops 每秒讀取速率過高"
          description: "Disk iops 每秒讀取速率超過 3000 iops"
          value: "{{ $value }}"

      - alert: NodeDiskWriteRateCountHigh
        expr: irate(node_disk_writes_completed_total[5m]) > 3000
        for: 5m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk iops 每秒寫入速率過高"
          description: "Disk iops 每秒寫入速率超過 3000 iops"
          value: "{{ $value }}"

      - alert: NodeInodeRootUsedPercentHigh
        expr: (1 - node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint="/"} / node_filesystem_files{fstype=~"ext4|xfs",mountpoint="/"}) * 100 > 80
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk(/ 分區) inode 使用率過高"
          description: "Disk (/ 分區) inode 使用率超過 80%"
          value: "{{ $value }}"

      - alert: NodeInodeBootUsedPercentHigh
        expr: (1 - node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint="/boot"} / node_filesystem_files{fstype=~"ext4|xfs",mountpoint="/boot"}) * 100 > 80
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} disk(/boot 分區) inode 使用率過高"
          description: "Disk (/boot 分區) inode 使用率超過 80%"
          value: "{{ $value }}"

      - alert: NodeFilefdAllocatedPercentHigh
        expr: node_filefd_allocated / node_filefd_maximum * 100 > 80
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} filefd 打開百分比過高"
          description: "Filefd 打開百分比 超過 80%"
          value: "{{ $value }}"

      - alert: NodeNetworkNetinBitRateHigh
        expr: avg by (instance) (irate(node_network_receive_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m]) * 8) > 20 * (1024 ^ 2) * 8
        for: 3m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} network 接收比特數 速率過高"
          description: "Network 接收比特數 速率超過 20MB/s"
          value: "{{ $value }}"

      - alert: NodeNetworkNetoutBitRateHigh
        expr: avg by (instance) (irate(node_network_transmit_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m]) * 8) > 20 * (1024 ^ 2) * 8
        for: 3m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} network 發送比特數 速率過高"
          description: "Network 發送比特數 速率超過 20MB/s"
          value: "{{ $value }}"

      - alert: NodeNetworkNetinPacketErrorRateHigh
        expr: avg by (instance) (irate(node_network_receive_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m])) > 15
        for: 3m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} 接收錯誤包 速率過高"
          description: "Network 接收錯誤包 速率超過 15個/秒"
          value: "{{ $value }}"

      # Uses the transmit *error* counter, mirroring the receive-side rule above
      # (the total packet counter would fire on ordinary traffic).
      - alert: NodeNetworkNetoutPacketErrorRateHigh
        expr: avg by (instance) (irate(node_network_transmit_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m])) > 15
        for: 3m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} 發送錯誤包 速率過高"
          description: "Network 發送錯誤包 速率超過 15個/秒"
          value: "{{ $value }}"

      - alert: NodeProcessBlockedHigh
        expr: node_procs_blocked{job="node"} > 10
        for: 10m
        labels:
          severity: warning
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} 當前被阻塞的任務的數量過多"
          description: "Process 當前被阻塞的任務的數量超過 10個"
          value: "{{ $value }}"

      - alert: NodeTimeOffsetHigh
        expr: abs(node_timex_offset_seconds{job="node"}) > 3 * 60
        for: 2m
        labels:
          severity: info
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "instance: {{ $labels.instance }} 時間偏差過大"
          description: "Time 節點的時間偏差超過 3m"
          value: "{{ $value }}"
?
?
https://segmentfault.com/a/1190000043690204
prometheus結合consul+confd實現動態注冊服務和動態更新配置告警規則_prometheus confd-CSDN博客
如若想動態修改下面規則內容,可采用以下方案:
# Rule files loaded by Prometheus; the glob picks up every *_rules.yaml under /rules.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/rules/*_rules.yaml"
?
# Download the binary
wget https://github.com/kelseyhightower/confd/releases/download/v0.16.0/confd-0.16.0-linux-amd64

# Rename the binary and move it into a directory that is on PATH
mv confd-0.16.0-linux-amd64 /usr/local/bin/confd
chmod +x /usr/local/bin/confd

# Verify that the installation succeeded
confd --help

# Create the confd configuration directories (brace expansion: needs bash or a compatible shell)
sudo mkdir -p /etc/confd/{conf.d,templates,rules}