1、部署VictoriaMetrics
# Download the single-node VictoriaMetrics v1.65.0 release into /usr/local and unpack it.
cd /usr/local
wget https://github.com/VictoriaMetrics/VictoriaMetrics/releases/download/v1.65.0/victoria-metrics-amd64-v1.65.0.tar.gz
# mkdir -p keeps this step re-runnable: a plain mkdir fails when the directory already
# exists, which aborts the && chain and leaves the shell outside victoria-metrics/.
mkdir -p victoria-metrics && tar -xvzf victoria-metrics-amd64-v1.65.0.tar.gz && \
mv victoria-metrics-prod victoria-metrics/victoria-metrics && cd victoria-metrics
# Run in the background with 30 days of retention; data is stored in ./data and
# stdout/stderr go to victoria-metrics.log instead of the implicit nohup.out.
nohup ./victoria-metrics -retentionPeriod=30d -storageDataPath=data > victoria-metrics.log 2>&1 &
2、配置Prometheus
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
  external_labels:
    datacenter: "victoria-1"

# Remote-write scraped samples to VictoriaMetrics
remote_write:
  - url: "http://127.0.0.1:8428/api/v1/write"
    queue_config:
      max_samples_per_send: 10000

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "/usr/local/prometheus/rules/rule_node_down.yml"
  - "/usr/local/prometheus/rules/rule_disk_over.yml"
  - "/usr/local/prometheus/rules/rule_cpu_over.yml"
  - "/usr/local/prometheus/rules/rule_memory_over.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Scrape VictoriaMetrics itself
  # NOTE(review): 8480/8481/8482 look like the vminsert/vmselect/vmstorage ports of the
  # cluster version, while step 1 deploys the single-node binary (port 8428) - confirm
  # these targets match the actual deployment.
  - job_name: 'victoria'
    static_configs:
      - targets: ['47.105.38.224:8480']
      - targets: ['47.105.38.224:8481']
      - targets: ['47.105.38.224:8482']
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

  # Discover scrape targets from the services registered in Consul
  - job_name: 'consul-prometheus'
    # metrics_path: "/v1/agent/metrics"
    scrape_interval: 60s
    scrape_timeout: 10s
    scheme: http
    params:
      format: ['prometheus']
    #static_configs:
    #  - targets:
    #      - 47.105.38.224:8500
    consul_sd_configs:
      - server: '47.105.38.224:8500'
        services: []
    relabel_configs:
      # Rewrite the default /metrics path to /actuator/prometheus
      - source_labels: [__metrics_path__]
        separator: ;
        regex: /metrics
        target_label: __metrics_path__
        replacement: /actuator/prometheus
        action: replace
      # Keep only services whose Consul tags contain metrics=true
      - source_labels: ['__meta_consul_tags']
        regex: '^.*,metrics=true,.*$'
        action: keep

  - job_name: "node20_exporter"
    static_configs:
      - targets: ["localhost:9100"]

  - job_name: "node21_exporter"
    static_configs:
      - targets: ["172.16.17.21:9100"] # monitored host

  - job_name: "node22_exporter"
    static_configs:
      - targets: ["172.16.17.22:9100"]

  - job_name: "node23_exporter"
    static_configs:
      - targets: ["172.16.17.23:9100"] # monitored host

  - job_name: "alertmanager"
    static_configs:
      - targets: ["localhost:9093"]

3、配置Grafana數據源
在Grafana中新增一個Prometheus類型的數據源,URL填寫VictoriaMetrics的地址(本文的單節點部署為 http://127.0.0.1:8428)。
4、構建vmalert
從源代碼構建vmalert:
# Fetch the VictoriaMetrics sources and build the vmalert binary.
# Chained with && so that make never runs outside the repository when the clone or cd fails.
# NOTE(review): the build presumably needs a Go toolchain installed - confirm the required version.
git clone https://github.com/VictoriaMetrics/VictoriaMetrics && \
cd VictoriaMetrics && \
make vmalert
構建完成後,二進制文件位於 VictoriaMetrics/bin 文件夾中。
5、添加alert.rules
vim alert.rules
# Example rule: fire when a scrape target has been down (up == 0) for 2 minutes.
groups:
  - name: test-rule
    rules:
      - alert: 主機狀態
        expr: up == 0
        for: 2m
        labels:
          status: warning
        annotations:
          summary: "{{$labels.instance}}:服務器關閉"
          description: "{{$labels.instance}}:服務器關閉"
6、修改釘釘prometheus-webhook-dingtalk配置文件
vim /usr/local/prometheus-webhook-dingtalk/config.example.yml
# NOTE(review): this edits config.example.yml - make sure the service is started with
# --config.file pointing at this file (or copy it to the config file it actually loads).

## Request timeout
# timeout: 5s

## Uncomment following line in order to write template from scratch (be careful!)
#no_builtin_template: true

## Customizable templates path
templates:
  - '/usr/local/alertmanager/template/default.tmpl'

## You can also override default template using `default_message`
## The following example to use the 'legacy' template from v0.3.0
#default_message:
#  title: '{{ template "legacy.title" . }}'
#  text: '{{ template "legacy.content" . }}'

## Targets, previously was known as "profiles"
targets:
  webhook1:
    url: https://oapi.dingtalk.com/robot/send?access_token=XXXXXXXX
    # secret for signature
    secret: SEC000000000000000000000
    # webhook2:
    #   url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    # webhook_legacy:
    #   url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    # Customize template content
    # NOTE(review): webhook_legacy is commented out, so this message block applies to
    # webhook1 - presumably intended; confirm.
    message:
      # Use legacy template
      # title: '{{ template "legacy.title" . }}'
      text: '{{ template "wechat.default.message" . }}'
  webhook_mention_all:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      all: true
  webhook_mention_users:
    url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx
    mention:
      mobiles: ['156xxxx8827', '189xxxx8325']

7、修改alertmanager配置文件
vim /usr/local/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m # resolve timeout; the default is 5m

# Paths of the notification template files; relative paths and globs such as
# template/*.tmpl are accepted.
templates:
  - '/usr/local/alertmanager/template/default.tmpl'

# Routing tree
route:
  group_by: [alertname] # labels used to group alerts
  receiver: ops_notify # default receiver
  group_wait: 30s # how long to wait before sending the first notification for a new group
  group_interval: 60s # how long to wait before notifying about new alerts added to a group
  repeat_interval: 1h # how long to wait before re-sending a notification (Alertmanager's default is 4h)
  routes:
    - receiver: ops_notify # recipient of the basic alerts
      group_wait: 10s
      match_re:
        alertname: 實例存活告警|磁盤使用率告警 # matched against the alert names defined in the rules
    - receiver: info_notify # recipient of the informational alerts
      group_wait: 10s
      match_re:
        alertname: 內存使用率告警|CPU使用率告警|目錄大小告警

receivers:
  # Receiver for the basic alerts
  - name: ops_notify
    webhook_configs:
      - url: http://localhost:8060/dingtalk/webhook1/send
        send_resolved: true # also notify when the alert is resolved
        # message: '{{ template "wechat.default.message" . }}'
  # Receiver for the informational alerts
  - name: info_notify
    webhook_configs:
      - url: http://localhost:8060/dingtalk/webhook1/send
        send_resolved: true
        # message: '{{ template "wechat.default.message" . }}'

# An inhibition rule mutes alerts matching one set of matchers while an alert matching
# another set of matchers is firing. Both alerts must carry the same values for the
# labels listed in `equal`.
# NOTE(review): the sample rule in step 5 sets the label `status`, not `severity`, so
# this rule will not apply to it - confirm which label the real rules use.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
8、啟動vmalert
# Start vmalert in the background: evaluate alert.rules against VictoriaMetrics
# (localhost:8428) and send firing alerts to Alertmanager (localhost:9093).
# nohup and the log redirect match how victoria-metrics is started in step 1, so the
# process survives the terminal closing.
nohup ./bin/vmalert -rule=alert.rules \
  -datasource.url=http://localhost:8428 \
  -notifier.url=http://localhost:9093 > vmalert.log 2>&1 &