from kubernetes import client, config
def _build_es_alert_rules(app_name, app_team):
    """Return the canonical list of Elasticsearch alert rules for one cluster.

    Every expression selects metrics labelled ``es_cluster=<app_name>`` (and
    ``job=~".*http"``), and every rule carries ``alertTeam``/``appName`` labels
    so Alertmanager can route it to the owning team.

    Args:
        app_name: Cluster name as used in the ``es_cluster`` metric label.
        app_team: Value of the PrometheusRule's ``Team`` metadata label.

    Returns:
        A list of 11 PrometheusRule ``rules`` entries (dicts).
    """
    # Label selector shared by every expression below.
    sel = f'job=~".*http",es_cluster="{app_name}"'

    def alert(name, description, summary, expr, duration, severity):
        # One "rules" entry with the standard annotation/label layout.
        return {
            "alert": name,
            "annotations": {
                "description": description,
                "summary": summary,
            },
            "expr": expr,
            "for": duration,
            "labels": {
                "alertTeam": app_team,
                "appName": app_name,
                "severity": severity,
            },
        }

    heap_expr = (
        f"(es_jvm_mem_heap_used_bytes{{{sel}}} / "
        f"es_jvm_mem_heap_max_bytes{{{sel}}}) * 100"
    )
    disk_expr = f"(es_fs_total_free_bytes{{{sel}}}/es_fs_total_total_bytes{{{sel}}}) * 100"
    unassigned_expr = (
        f'max(es_cluster_shards_number{{type="unassigned",{sel}}}) '
        f"by (es_cluster) > 0"
    )

    # NOTE(review): the two ElasticsearchUnassignedShards entries share one
    # alert name (warning @15m, critical @30m) — an escalation pair; confirm
    # this is intentional before renaming either.
    return [
        alert(
            "ElasticsearchHeapUsageTooHigh",
            "The heap usage is over 90% for 5m VALUE = `{{ $value }}`\n NAME: `{{ $labels.node }}`",
            "Elasticsearch Heap Usage Too High (node `{{ $labels.node }}`)",
            f"{heap_expr} > 90",
            "20m",
            "critical",
        ),
        alert(
            "ElasticsearchHeapUsageWarning",
            "The heap usage is over 80% for 15m\n VALUE = `{{ $value }}`\n NAME: `{{ $labels.node }}`",
            "Elasticsearch Heap Usage warning (node `{{ $labels.node }}`)",
            f"{heap_expr} > 80",
            "15m",
            "warning",
        ),
        alert(
            "ElasticsearchAvgDiskOutOfSpace_Warning",
            "The disk usage is over 85%\n VALUE = `{{ $value }}`",
            "Elasticsearch average disk out of space (node - `{{ $labels.node }}`). No new shards will be allocated at this node",
            f"{disk_expr} < 15",
            "20m",
            "warning",
        ),
        alert(
            "ElasticsearchDiskOutOfSpace",
            "The disk usage is over 90%\n VALUE = `{{ $value }}`\n NAME: `{{ $labels.node }}`",
            "Elasticsearch disk out of space (node `{{ $labels.node }}`). No new shards will be allocated at this node",
            f"{disk_expr} < 10",
            "10m",
            "critical",
        ),
        alert(
            "ElasticsearchClusterRed",
            "Elastic Cluster Red",
            "Elasticsearch Cluster Red (cluster - `{{ $labels.es_cluster }}`)",
            f"max(es_cluster_status{{{sel}}}) by (es_cluster) == 2",
            "5m",
            "critical",
        ),
        alert(
            "ElasticsearchClusterYellow",
            "Elastic Cluster Yellow for 15 minutes",
            "Elasticsearch Cluster Yellow (cluster - `{{ $labels.es_cluster }}`)",
            f"max(es_cluster_status{{{sel}}}) by (es_cluster) == 1",
            "15m",
            "warning",
        ),
        alert(
            "ElasticsearchClusterIndexReplicaUnavailable",
            "Elastic Cluster Index Replica less than 1 for 15 minutes\n VALUE = `{{ $value }}`",
            "Elasticsearch Cluster Index Replica less than 1 (cluster - `{{ $labels.es_cluster }}`)",
            f'min(es_index_replicas_number{{{sel},index!~"^[.].*"}}) by (es_cluster,index) < 1',
            "15m",
            "warning",
        ),
        alert(
            "ElasticsearchInitializingShards",
            "Number of initializing shards for 10 min\n VALUE = `{{ $value }}`",
            "Elasticsearch initializing shards (cluster `{{ $labels.es_cluster }}`)",
            f'max(es_cluster_shards_number{{type="initializing",{sel}}}) by (es_cluster) > 0',
            "10m",
            "warning",
        ),
        alert(
            "ElasticsearchUnassignedShards",
            "Number of unassigned shards for 30 min\n VALUE = `{{ $value }}`",
            "Elasticsearch unassigned shards (cluster `{{ $labels.es_cluster }}`)",
            unassigned_expr,
            "30m",
            "critical",
        ),
        alert(
            "ElasticsearchUnassignedShards",
            "Number of unassigned shards for 15 min\n VALUE = `{{ $value }}`",
            "Elasticsearch unassigned shards (cluster `{{ $labels.es_cluster }}`)",
            unassigned_expr,
            "15m",
            "warning",
        ),
        alert(
            "ElasticsearchPendingTasks",
            "Number of pending tasks for 15 min. Cluster works slowly.\n VALUE = `{{ $value }}`",
            "Elasticsearch pending tasks (cluster `{{ $labels.es_cluster }}`)",
            f"max(es_cluster_pending_tasks_number{{{sel}}}) by (es_cluster) > 0",
            "15m",
            "warning",
        ),
    ]


def update_es_rule(api_instance, rule, namespace):
    """Replace the first rule group of an ES PrometheusRule with the standard set.

    Skips (with a message) rules that have no groups or no ``Team`` metadata
    label; otherwise rewrites ``spec.groups[0].rules`` in place and pushes the
    whole object back via the CustomObjects API.

    Args:
        api_instance: kubernetes ``CustomObjectsApi`` (or compatible) client.
        rule: PrometheusRule custom object as a dict.
        namespace: Namespace the rule lives in.
    """
    if not rule["spec"]["groups"]:
        print(f"PrometheusRule: {namespace} {rule['metadata']['name']} has no rules")
        return

    if "labels" not in rule["metadata"] or "Team" not in rule["metadata"]["labels"]:
        print(
            f"PrometheusRule: {namespace} {rule['metadata']['name']} has no Team label"
        )
        return

    # "<cluster>-...-alerts" -> "<cluster>-...": everything before the first
    # "-alerts" is the es_cluster metric label value.
    app_name = rule["metadata"]["name"].split("-alerts")[0]
    app_team = rule["metadata"]["labels"]["Team"]

    # Single assignment replaces the group's rules (the original cleared the
    # list first — a dead store).
    rule["spec"]["groups"][0]["rules"] = _build_es_alert_rules(app_name, app_team)

    api_instance.replace_namespaced_custom_object(
        group="monitoring.coreos.com",
        version="v1",
        plural="prometheusrules",
        namespace=namespace,
        name=rule["metadata"]["name"],
        body=rule,
    )
def get_all_es_rules(api_instance, namespaces):
    """Scan each namespace for Elasticsearch alert PrometheusRules and refresh them.

    Any PrometheusRule whose name ends in ``-elasticsearch-alerts`` is handed
    to ``update_es_rule``; every other rule is left untouched.

    Args:
        api_instance: kubernetes ``CustomObjectsApi`` (or compatible) client.
        namespaces: Iterable of namespace names to scan.
    """
    suffix = "-elasticsearch-alerts"
    for ns in namespaces:
        listing = api_instance.list_namespaced_custom_object(
            group="monitoring.coreos.com",
            version="v1",
            plural="prometheusrules",
            namespace=ns,
        )
        es_rules = (
            item
            for item in listing.get("items", [])
            if item["metadata"]["name"].endswith(suffix)
        )
        for item in es_rules:
            print(f"Updating rule: {item['metadata']['name']} in namespace: {ns}")
            update_es_rule(api_instance, item, ns)
def main():
    """Rewrite the Elasticsearch alert PrometheusRules in every configured cluster.

    For each kubeconfig context: load it, enumerate all namespaces, and run
    get_all_es_rules over them. Failures for one context are reported and the
    loop continues with the next context.
    """
    kubernetes_contexts = ["teleport.spike.navi-tech.in-spike.np.navi-tech.in"]

    for context in kubernetes_contexts:
        print(f"Modifying PrometheusRule resources for Kubernetes context: {context}")
        try:
            config.load_kube_config(context=context)

            api_inst = client.CustomObjectsApi()
            v1 = client.CoreV1Api()
            # All namespace names in the cluster (renamed loop variable: the
            # original comprehension shadowed the result name `ns`).
            ns = [item.metadata.name for item in v1.list_namespace().items]
            get_all_es_rules(api_inst, ns)
            print(
                f"PrometheusRule modifications completed for Kubernetes context: {context}"
            )
            print("--------------------------------------------------------------")
        except Exception as e:
            # Top-level boundary: report and move on to the next context rather
            # than aborting the whole run.
            print(f"Error occurred for Kubernetes context {context}: {str(e)}")


if __name__ == "__main__":
    main()