由于公司需要监控目标类型较多,不能手动去改动prometheus规则然后reload,所以就通过python写了个程序自动更新prometheus配置
基本环境准备
- python 3.10.10
- flask 2.3.2
- prometheus 2.52.0
基本流程
- 将接口传来的prometheus规则信息保存到数据表中
- 取数据表中所有prometheus规则生成规则文件保存到本地临时文件夹内
- 获取需要修改prometheus机器ip
- 根据第三步获取的ip读取之前的prometheus规则备用
- 根据第三步获取的ip删除之前的prometheus规则文件
- 把第二步生成的规则文件上传到第三步获取ip机器上
- 通过调用http://{ip}:9090/-/reload接口让配置文件重新生效
- 如果新的规则文件未生效,把第四步备用的旧规则文件重新上传
- 清除第二步中的临时文件夹
以上为开发流程,在基本环境准备好的前提下开始开发。本文涉及的kevin模块均为本人自行开发的工具模块。
1.数据表创建及模型开发
-- Schema of the table that stores one Prometheus alert rule per row.
-- WARNING: the DROP below destroys any existing rules before re-creating the table.
DROP TABLE IF EXISTS `prom_ruler`;
CREATE TABLE `prom_ruler` (
`id` int(11) NOT NULL AUTO_INCREMENT,
-- Alert expression; the application stores metric + comparison symbol + threshold here.
`expr` varchar(1023) DEFAULT NULL,
-- Value written to the rule's "for" field in the generated rule file.
`duration` varchar(16) DEFAULT NULL,
-- Value written to the rule's "severity" label.
`severity` varchar(15) DEFAULT NULL,
`summary` text,
`description` text,
-- Id of the data source row; the application reads the Prometheus host list from it.
`datasource` int(11) DEFAULT NULL,
-- Id of the group row; the group's name is used as the alert name.
`group` int(11) DEFAULT NULL,
`updated_on` datetime DEFAULT NULL,
`created_on` datetime DEFAULT NULL,
-- Id of the alarm-heal row; the application stores 0 when none is configured.
`heal` int(11) DEFAULT NULL,
PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8 ROW_FORMAT=DYNAMIC;
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from sqlalchemy import Column, Integer, String, TEXT
from kevin.sqlalchemy_utils.model import Model
class PromRulerModel(Model):
    """ORM mapping for the ``prom_ruler`` table (one Prometheus alert rule per row).

    The string lengths below mirror the ``CREATE TABLE prom_ruler`` statement
    so that the model and the table agree; an unsized ``String`` cannot be
    rendered as a MySQL VARCHAR if the schema is ever created from the model.

    NOTE(review): ``id``, ``created_on`` and ``updated_on`` are not declared
    here; presumably they come from the ``Model`` base class - confirm.
    """

    __tablename__ = "prom_ruler"

    # Alert expression: metric + comparison symbol + threshold.
    expr = Column(String(1023))
    # Written to the rule's "for" field.
    duration = Column(String(16))
    # Written to the rule's "severity" label.
    severity = Column(String(15))
    summary = Column(TEXT)
    description = Column(TEXT)
    # Id of the data source row that lists the Prometheus hosts.
    datasource = Column(Integer)
    # Id of the group row whose name becomes the alert name.
    group = Column(Integer)
    # Id of the alarm-heal row; 0 means "none".
    heal = Column(Integer)
2.业务逻辑处理
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import traceback
from typing import List
import yaml
import requests
from flask import request
from kevin.common.http_code import HttpCode
from kevin.log import logger
from kevin.sqlalchemy_utils.model_utils import list_to_dictlist
from kevin.utils import pretty_result
from config import Config
from extensions.ssh_ext import read_remote_file, exec_ssh_one_cmd, upload_file_to_linux
from models.alarm_heal import AlarmHeal
from models.common import GroupModel, DataSourceModel
from models.prometheus import PromRulerModel, PromRuleClassifyModel
from resource.v1.dc2_monitor_mixin import MonitorMixin
class PrometheusDbView(MonitorMixin):
    """Database-side operations on Prometheus alert rules (``PromRulerModel``)."""

    @staticmethod
    def generate_prometheus_rule_model(prometheus_rule: dict) -> PromRulerModel:
        """Build an unsaved ``PromRulerModel`` from a request payload.

        :param prometheus_rule: payload with the keys ``monitorMetrics``,
            ``symbols``, ``alarmThreshold``, ``duration``, ``severity``,
            ``summary``, ``description``, ``datasource``, ``group`` and,
            optionally, ``heal``.
        :return: the populated model; ``heal`` is 0 when the payload has no
            usable value (missing, falsy, or the literal string ``"None"``).
        :raises KeyError: if a mandatory key is missing.
        """
        prom_rule_model = PromRulerModel(
            # The expression is stored as metric + comparison symbol + threshold.
            expr=prometheus_rule["monitorMetrics"] + prometheus_rule["symbols"] + prometheus_rule["alarmThreshold"],
            duration=prometheus_rule["duration"],
            severity=prometheus_rule["severity"],
            summary=prometheus_rule["summary"],
            description=prometheus_rule["description"],
            datasource=prometheus_rule["datasource"],
            group=prometheus_rule["group"],
        )
        heal = prometheus_rule.get("heal")
        prom_rule_model.heal = heal if heal and heal != "None" else 0
        return prom_rule_model

    @staticmethod
    def save_prometheus_rule(prom_rule_model) -> None:
        """Insert the model; raise if the ORM helper reports a falsy result."""
        add_prom_rule_model_res = prom_rule_model.objects.add()
        if not add_prom_rule_model_res:
            raise Exception("prom rule was failed deposited into the database.")

    @staticmethod
    def delete_prometheus_rule(prometheus_rule_id: int) -> dict:
        """Delete one rule and return a snapshot of its attributes.

        :param prometheus_rule_id: primary key of the rule to delete.
        :return: a copy of the row's attribute dict taken *before* deletion.
        :raises ValueError: if no rule with that id exists.
        """
        prom_rule = PromRulerModel.objects_().get_by_id(prometheus_rule_id)
        if prom_rule is None:
            raise ValueError(f"prom rule {prometheus_rule_id} does not exist.")
        # Copy the attributes: ``__dict__`` is the live instance dict, which
        # the ORM may clear or expire once the row has been deleted.
        prom_rule_snapshot = dict(prom_rule.__dict__)
        PromRulerModel.objects_().delete(PromRulerModel.id == prometheus_rule_id)
        return prom_rule_snapshot

    def save_prometheus_rule_to_db(self, prometheus_rule: dict) -> None:
        """Convert a request payload to a model and insert it."""
        logger.info("Start save prometheus rule to db.")
        prom_rule_model = self.generate_prometheus_rule_model(prometheus_rule=prometheus_rule)
        self.save_prometheus_rule(prom_rule_model=prom_rule_model)
        logger.info("Success save prometheus rule to db.")

    @staticmethod
    def get_prometheus_rule_by_db() -> List[dict]:
        """Return every stored rule as a dict, with names resolved.

        For each rule the raw ids are kept under ``datasource_id`` /
        ``group_id`` and ``datasource`` / ``group`` are replaced by the
        referenced row's name. When a referenced row cannot be resolved the
        original value is left in place and an error is logged, so a single
        dangling reference does not break the whole listing.
        """
        prom_rules = list_to_dictlist(PromRulerModel.objects_().list() or [])
        for rule in prom_rules:
            datasource = rule.get("datasource")
            # 'None' (a string) is what the original code checked for; a
            # real None is skipped as well instead of crashing in int().
            if datasource is not None and datasource != 'None':
                rule["datasource_id"] = int(datasource)
                try:
                    rule["datasource"] = DataSourceModel.objects_().get_by_id(int(datasource)).name
                except Exception:
                    logger.error("datasource deleted!")
            group = rule.get("group")
            if group is not None and group != 'None':
                rule["group_id"] = int(group)
                try:
                    rule["group"] = GroupModel.objects_().get_by_id(int(group)).name
                except Exception:
                    logger.error("group deleted!")
        return prom_rules

    def update_prometheus_rule_by_db(self, prometheus_rule: dict) -> None:
        """Replace a stored rule with the contents of ``prometheus_rule``.

        The update is implemented as delete + insert.

        NOTE(review): the replacement row is inserted without an explicit id,
        so it presumably receives a new auto-increment id, and the rule is
        lost if the insert fails after the delete - confirm whether callers
        that store rule ids rely on them staying stable.

        :raises ValueError: if the rule to replace does not exist.
        """
        logger.info("Start update prometheus rule to db.")
        prom_rule_model = self.generate_prometheus_rule_model(prometheus_rule=prometheus_rule)
        self.delete_prometheus_rule(prometheus_rule_id=prometheus_rule["id"])
        self.save_prometheus_rule(prom_rule_model=prom_rule_model)
        logger.info("Finish update prometheus rule to db.")
class PrometheusRuleView(MonitorMixin):
    """Rule-file operations: render rules to YAML, push them to hosts, reload."""

    @staticmethod
    def generate_prometheus_ips(prometheus_rule: dict) -> list:
        """Return the Prometheus hosts configured on the rule's data source.

        The data source's ``url`` field is treated as a comma-separated list;
        surrounding whitespace and empty entries are dropped so that values
        such as ``"10.0.0.1, 10.0.0.2,"`` yield two usable hosts.
        """
        datasource_id = int(prometheus_rule["datasource"])
        url = DataSourceModel.objects_().get_by_id(datasource_id).url
        return [ip.strip() for ip in url.split(",") if ip.strip()]

    def read_prom_config(self, ip: str) -> list:
        """Read the current remote rule file so it can be restored later."""
        host_info = self._get_host_info(ip)
        prom_rule_old_yml = read_remote_file(host_info=host_info, remote_path=Config.PROMETHEUS_CONFIG_PATH)
        return prom_rule_old_yml

    def generate_prometheus_yml(self) -> None:
        """Render every stored rule into the local rule file.

        All rules go into a single group named ``default_group``. A rule that
        cannot be rendered (for example because its group row no longer
        exists) is logged and skipped so the remaining rules are still written.
        """
        prometheus_yml = {
            "groups": [
                {
                    "name": "default_group",
                    "rules": []
                }
            ]
        }
        prom_rule_model = PromRulerModel.objects_().list() or []
        prom_rule_list = list_to_dictlist(prom_rule_model)
        for prom_rule in prom_rule_list:
            try:
                prom_rule_dict = {
                    # The group's name doubles as the alert name.
                    "alert": GroupModel.objects_().get_by_id(int(prom_rule["group"])).name,
                    "annotations": {
                        "summary": prom_rule["summary"],
                        "description": prom_rule["description"],
                    },
                    "expr": prom_rule["expr"],
                    "for": prom_rule["duration"],
                    "labels": {
                        "prom_id": prom_rule["id"],
                        "severity": prom_rule["severity"]
                    }
                }
                prometheus_yml["groups"][0]["rules"].append(prom_rule_dict)
            except Exception as e:
                logger.error(f"generate prom rule, error: {e}, {traceback.format_exc()}")
                logger.error(f"prom rule {prom_rule}")
        self.generate_conf_tmp_dir()
        with open(Config.PROMETHEUS_CONFIG_LOCAL_PATH, "w", encoding='utf-8') as f:
            yaml.dump(prometheus_yml, f, allow_unicode=True)

    @staticmethod
    def generate_old_prometheus_yml(prom_rule_old_yml) -> None:
        """Write the previously read remote rules to the local rollback file.

        Raw file content (``str``/``bytes`` or a non-empty list of text
        lines) is written back verbatim; passing it through ``yaml.dump``
        would serialise it as a YAML string or sequence, which is not a
        rule file. Any other value is treated as parsed YAML and dumped.

        NOTE(review): the return type of ``read_remote_file`` is not visible
        here - confirm which of these shapes it actually produces.
        """
        if isinstance(prom_rule_old_yml, bytes):
            prom_rule_old_yml = prom_rule_old_yml.decode("utf-8")
        is_line_list = (
            isinstance(prom_rule_old_yml, (list, tuple))
            and bool(prom_rule_old_yml)
            and all(isinstance(line, str) for line in prom_rule_old_yml)
        )
        with open(Config.PROMETHEUS_CONFIG_OLD_LOCAL_PATH, "w", encoding='utf-8') as f:
            if isinstance(prom_rule_old_yml, str):
                f.write(prom_rule_old_yml)
            elif is_line_list:
                # Lines may or may not carry their trailing newline.
                f.writelines(
                    line if line.endswith("\n") else line + "\n"
                    for line in prom_rule_old_yml
                )
            else:
                yaml.dump(prom_rule_old_yml, f, allow_unicode=True)

    def remove_remote_prometheus_rules(self, ip: str) -> None:
        """Delete the rule file on the remote host over SSH."""
        remove_remote_prometheus_rule_cmd = f"rm -f {Config.PROMETHEUS_CONFIG_PATH}"
        host_info = self._get_host_info(ip)
        exec_ssh_one_cmd(host_info=host_info, command=remove_remote_prometheus_rule_cmd)

    def upload_prometheus_rules_file(self, ip: str, local_host: str) -> None:
        """Upload a local file to the remote rule-file path.

        :param ip: target host.
        :param local_host: path of the local file to upload (despite the name).
        """
        host_info = self._get_host_info(ip)
        upload_file_to_linux(
            host_info=host_info,
            local_path=local_host,
            remote_path=Config.PROMETHEUS_CONFIG_PATH
        )

    @staticmethod
    def reload_prometheus_rules(ip: str) -> bool:
        """Ask Prometheus on ``ip`` to reload its configuration.

        :return: ``True`` on HTTP 200; ``False`` on any other status, on a
            timeout, or on a connection error, so callers can roll back
            instead of hanging or crashing.
        """
        url = f"http://{ip}:9090/-/reload"
        # Optional override; falls back to 30 seconds when not configured.
        timeout = getattr(Config, "PROMETHEUS_RELOAD_TIMEOUT", 30)
        try:
            resp = requests.post(url=url, timeout=timeout)
        except requests.RequestException as e:
            logger.error(f"Failed to reload prom rule {ip}, error: {e}")
            return False
        if resp.status_code == 200:
            logger.info(f"Success reload prom rule {ip}")
            return True
        logger.error(f"Failed to reload prom rule {ip}")
        return False

    def reload_prometheus_conf(self, prometheus_rule: dict) -> None:
        """Regenerate the rule file and deploy it to every configured host.

        For each host the previous remote rules are read first; if the reload
        after uploading the new file fails, the previous rules are uploaded
        again and another reload is triggered. The local temp directory is
        removed even when a step raises.
        """
        try:
            # 2. Render the new rule file locally.
            self.generate_prometheus_yml()
            # 3. Resolve the target hosts.
            prometheus_ips = self.generate_prometheus_ips(prometheus_rule=prometheus_rule)
            for prometheus_ip in prometheus_ips:
                # 4. Keep the current remote rules as a fallback.
                prom_old_rule = self.read_prom_config(ip=prometheus_ip)
                # 5. Remove the remote rule file.
                self.remove_remote_prometheus_rules(ip=prometheus_ip)
                # 6. Upload the new rule file.
                self.upload_prometheus_rules_file(ip=prometheus_ip, local_host=Config.PROMETHEUS_CONFIG_LOCAL_PATH)
                # 7. Trigger the remote reload.
                if self.reload_prometheus_rules(ip=prometheus_ip):
                    continue
                # 8-10. Reload failed: write the old rules locally, upload
                # them again and reload once more.
                self.generate_old_prometheus_yml(prom_old_rule)
                self.upload_prometheus_rules_file(ip=prometheus_ip, local_host=Config.PROMETHEUS_CONFIG_OLD_LOCAL_PATH)
                if not self.reload_prometheus_rules(ip=prometheus_ip):
                    logger.error(f"Failed to roll back prom rule {prometheus_ip}")
        finally:
            # Remove the temporary folder no matter how deployment ended.
            self.clear_conf_tmp_dir()
class PrometheusView(PrometheusDbView, PrometheusRuleView):
    """REST resource for listing, creating, updating and deleting alert rules."""

    @staticmethod
    def _paginate(items: list, page, pagesize) -> list:
        """Return one page of ``items``.

        A missing ``page`` means the first page and a missing ``pagesize``
        means "everything"; ``page`` is clamped to at least 1 and
        ``pagesize`` to at least 0 so the slice never uses negative indices.
        """
        page_no = max(int(page), 1) if page is not None else 1
        size = max(int(pagesize), 0) if pagesize is not None else len(items)
        start = (page_no - 1) * size
        return items[start:start + size]

    @staticmethod
    def _list_rules_by_classify(rule_classify_id: int) -> list:
        """Return the rules referenced by one rule classification as dicts.

        Names of referenced rows are resolved; when a referenced data source
        or group no longer exists the raw id is returned in its place, and a
        missing heal row yields an empty ``heal_title``.
        """
        prom_rule_classify = PromRuleClassifyModel.objects_().get_by_id(rule_classify_id)
        if prom_rule_classify is None:
            return []
        # ``rule_ids`` is a comma-separated id string; ignore blanks.
        rule_ids = [
            rule_id.strip()
            for rule_id in (prom_rule_classify.rule_ids or "").split(",")
            if rule_id.strip()
        ]
        if not rule_ids:
            return []
        prom_rules = []
        for prom_rule in PromRulerModel.objects_().list_by_ids(rule_ids) or []:
            datasource = DataSourceModel.objects_().get_by_id(prom_rule.datasource)
            group = GroupModel.objects_().get_by_id(prom_rule.group)
            rule = {
                "datasource_id": prom_rule.datasource,
                "datasource": datasource.name if datasource is not None else prom_rule.datasource,
                "description": prom_rule.description,
                "duration": prom_rule.duration,
                "expr": prom_rule.expr,
                "group_id": prom_rule.group,
                "group": group.name if group is not None else prom_rule.group,
                "id": prom_rule.id,
                "severity": prom_rule.severity,
                "summary": prom_rule.summary,
                "heal": prom_rule.heal
            }
            heal = None
            if prom_rule.heal and int(prom_rule.heal) != 0:
                heal = AlarmHeal.objects_().get_by_id(int(prom_rule.heal))
            rule["heal_title"] = heal.title if heal is not None else ""
            prom_rules.append(rule)
        return prom_rules

    def get(self):
        """List rules.

        Query parameters: ``ruleClassifyId``, ``page``, ``pageSize``.
        With none of them supplied the full rule list is returned directly;
        otherwise the response is ``{"prom_rules": [...], "total": n}`` where
        ``prom_rules`` is the requested page.
        """
        rule_classify_id = request.values.get("ruleClassifyId")
        page = request.values.get("page")
        pagesize = request.values.get("pageSize")
        # The rule-classification page fetches rules without pagination.
        if rule_classify_id is None and page is None and pagesize is None:
            prom_rules = self.get_prometheus_rule_by_db()
            return pretty_result(code=HttpCode.OK, data=prom_rules)
        if rule_classify_id == "0" or rule_classify_id is None:
            prom_rules = self.get_prometheus_rule_by_db()
        else:
            prom_rules = self._list_rules_by_classify(int(rule_classify_id))
        total = len(prom_rules)
        prom_rules = self._paginate(prom_rules, page, pagesize)
        return pretty_result(code=HttpCode.OK, data={"prom_rules": prom_rules, "total": total})

    def post(self):
        """Create a rule from the JSON body, then redeploy the rule file."""
        prometheus_rule = request.get_json(force=True)
        # 1. Store the rule in the database.
        self.save_prometheus_rule_to_db(prometheus_rule=prometheus_rule)
        self.reload_prometheus_conf(prometheus_rule=prometheus_rule)
        return pretty_result(code=HttpCode.OK)

    def put(self):
        """Replace the rule identified by ``id`` in the JSON body, then redeploy."""
        prometheus_rule = request.get_json(force=True)
        self.update_prometheus_rule_by_db(prometheus_rule=prometheus_rule)
        self.reload_prometheus_conf(prometheus_rule=prometheus_rule)
        return pretty_result(code=HttpCode.OK)

    def delete(self):
        """Delete a rule, then redeploy the rule file.

        The JSON body is either the bare rule id or an object with an
        ``id`` key.
        """
        payload = request.get_json(force=True)
        prometheus_rule_id = payload.get("id") if isinstance(payload, dict) else payload
        prometheus_rule = self.delete_prometheus_rule(prometheus_rule_id=int(prometheus_rule_id))
        self.reload_prometheus_conf(prometheus_rule=prometheus_rule)
        return pretty_result(code=HttpCode.OK)
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import os
import shutil
import traceback
from flask_restful import Resource
from kevin.log import logger
from config import Config
from data_entity import HostInfo
class MonitorMixin(Resource):
    """Shared helpers for monitor resources: SSH host info and temp-dir handling."""

    @staticmethod
    def _get_host_info(ip, port=22):
        """Build the SSH connection info for a Prometheus host.

        :param ip: host name or address of the machine.
        :param port: SSH port; defaults to 22.
        :return: a ``HostInfo`` using the credentials from ``Config``.
        """
        host_info = HostInfo(
            hostname=ip,
            username=Config.MONITOR_USERNAME,  # user name on the Prometheus machine
            password=Config.MONITOR_PASSWORD,  # password on the Prometheus machine
            port=port
        )
        return host_info

    @staticmethod
    def clear_conf_tmp_dir() -> None:
        """Remove ``<PROJECT_DIR>/tmp`` and everything in it.

        Best effort: any failure is logged and not re-raised.
        """
        try:
            tmp_path = os.path.join(Config.PROJECT_DIR, "tmp")
            shutil.rmtree(tmp_path)
            logger.info(f"Success clear conf tmp dir: {tmp_path}")
        except Exception as e:
            logger.error(f"Failed to clear conf tmp dir, error: {e}, {traceback.format_exc()}")

    @staticmethod
    def generate_conf_tmp_dir() -> None:
        """Create ``<PROJECT_DIR>/tmp`` if it does not exist yet."""
        yaml_dir = os.path.join(Config.PROJECT_DIR, "tmp")
        # exist_ok avoids the check-then-create race between two requests
        # that both find the directory missing.
        os.makedirs(yaml_dir, exist_ok=True)
3.接口实现
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from .v1 import *
from flask_restful import Api
# Flask-RESTful API object; resources added to it are routed under /api/v1.
api = Api(prefix="/api/v1")
# Register the rule resource at /api/v1/prometheusconfigs.
# NOTE(review): PrometheusView is presumably provided by the star import of .v1 - confirm.
api.add_resource(PrometheusView, "/prometheusconfigs")