INFRA-2999 | Saqib | Script to update labels of Elastic Stack Prometheus rules

This commit is contained in:
Saqib Perwaiz
2024-03-20 17:18:33 +05:30
parent 2023ac4a16
commit 8ffe41f3f4

View File

@@ -38,7 +38,7 @@ def update_es_rule(api_instance, rule, namespace):
"description": "The heap usage is over 80% for 15m\n VALUE = `{{ $value }}`\n NAME: `{{ $labels.node }}`", "description": "The heap usage is over 80% for 15m\n VALUE = `{{ $value }}`\n NAME: `{{ $labels.node }}`",
"summary": "Elasticsearch Heap Usage warning (node `{{ $labels.node }}`)" "summary": "Elasticsearch Heap Usage warning (node `{{ $labels.node }}`)"
}, },
"expr": f"(es_jvm_mem_heap_used_bytes{{job=~\".*http\",es_cluster=\"{app_name}\"}} / es_jvm_mem_heap_max_bytes{{job=~\".*http\",es_cluster=\"{app_name}\"}}) * 100 \u003e 80", "expr": f"(es_jvm_mem_heap_used_bytes{{job=~\".*http\",es_cluster=\"{app_name}\"}} / es_jvm_mem_heap_max_bytes{{job=~\".*http\",es_cluster=\"{app_name}\"}}) * 100 > 80",
"for": "15m", "for": "15m",
"labels": { "labels": {
"alertTeam": app_team, "alertTeam": app_team,
@@ -52,7 +52,7 @@ def update_es_rule(api_instance, rule, namespace):
"description": "The disk usage is over 85%\n VALUE = `{{ $value }}`", "description": "The disk usage is over 85%\n VALUE = `{{ $value }}`",
"summary": "Elasticsearch average disk out of space (node - `{{ $labels.node }}`). No new shards will be allocated at this node" "summary": "Elasticsearch average disk out of space (node - `{{ $labels.node }}`). No new shards will be allocated at this node"
}, },
"expr": f"(es_fs_total_free_bytes{{job=~\".*http\",es_cluster=\"{app_name}\"}}/es_fs_total_total_bytes{{job=~\".*http\",es_cluster=\"{app_name}\"}}) * 100 \u003c 15", "expr": f"(es_fs_total_free_bytes{{job=~\".*http\",es_cluster=\"{app_name}\"}}/es_fs_total_total_bytes{{job=~\".*http\",es_cluster=\"{app_name}\"}}) * 100 < 15",
"for": "20m", "for": "20m",
"labels": { "labels": {
"alertTeam": app_team, "alertTeam": app_team,
@@ -66,7 +66,7 @@ def update_es_rule(api_instance, rule, namespace):
"description": "The disk usage is over 90%\n VALUE = `{{ $value }}`\n NAME: `{{ $labels.node }}`", "description": "The disk usage is over 90%\n VALUE = `{{ $value }}`\n NAME: `{{ $labels.node }}`",
"summary": "Elasticsearch disk out of space (node `{{ $labels.node }}`). No new shards will be allocated at this node" "summary": "Elasticsearch disk out of space (node `{{ $labels.node }}`). No new shards will be allocated at this node"
}, },
"expr": f"(es_fs_total_free_bytes{{job=~\".*http\",es_cluster=\"{app_name}\"}}/es_fs_total_total_bytes{{job=~\".*http\",es_cluster=\"{app_name}\"}}) * 100 \u003c 10", "expr": f"(es_fs_total_free_bytes{{job=~\".*http\",es_cluster=\"{app_name}\"}}/es_fs_total_total_bytes{{job=~\".*http\",es_cluster=\"{app_name}\"}}) * 100 < 10",
"for": "10m", "for": "10m",
"labels": { "labels": {
"alertTeam": app_team, "alertTeam": app_team,
@@ -108,7 +108,7 @@ def update_es_rule(api_instance, rule, namespace):
"description": "Elastic Cluster Index Replica less than 1 for 15 minutes\n VALUE = `{{ $value }}`", "description": "Elastic Cluster Index Replica less than 1 for 15 minutes\n VALUE = `{{ $value }}`",
"summary": "Elasticsearch Cluster Index Replica less than 1 (cluster - `{{ $labels.es_cluster }}`)" "summary": "Elasticsearch Cluster Index Replica less than 1 (cluster - `{{ $labels.es_cluster }}`)"
}, },
"expr": f"min(es_index_replicas_number{{job=~\".*http\",es_cluster=\"{app_name}\",index!~\"^[.].*\"}}) by (es_cluster,index) \u003c 1", "expr": f"min(es_index_replicas_number{{job=~\".*http\",es_cluster=\"{app_name}\",index!~\"^[.].*\"}}) by (es_cluster,index) < 1",
"for": "15m", "for": "15m",
"labels": { "labels": {
"alertTeam": app_team, "alertTeam": app_team,
@@ -122,7 +122,7 @@ def update_es_rule(api_instance, rule, namespace):
"description": "Number of initializing shards for 10 min\n VALUE = `{{ $value }}`", "description": "Number of initializing shards for 10 min\n VALUE = `{{ $value }}`",
"summary": "Elasticsearch initializing shards (cluster `{{ $labels.es_cluster }}`)" "summary": "Elasticsearch initializing shards (cluster `{{ $labels.es_cluster }}`)"
}, },
"expr": f"max(es_cluster_shards_number{{type=\"initializing\",job=~\".*http\",es_cluster=\"{app_name}\"}}) by (es_cluster) \u003e 0", "expr": f"max(es_cluster_shards_number{{type=\"initializing\",job=~\".*http\",es_cluster=\"{app_name}\"}}) by (es_cluster) > 0",
"for": "10m", "for": "10m",
"labels": { "labels": {
"alertTeam": app_team, "alertTeam": app_team,
@@ -136,7 +136,7 @@ def update_es_rule(api_instance, rule, namespace):
"description": "Number of unassigned shards for 30 min\n VALUE = `{{ $value }}`", "description": "Number of unassigned shards for 30 min\n VALUE = `{{ $value }}`",
"summary": "Elasticsearch unassigned shards (cluster `{{ $labels.es_cluster }}`)" "summary": "Elasticsearch unassigned shards (cluster `{{ $labels.es_cluster }}`)"
}, },
"expr": f"max(es_cluster_shards_number{{type=\"unassigned\",job=~\".*http\",es_cluster=\"{app_name}\"}}) by (es_cluster) \u003e 0", "expr": f"max(es_cluster_shards_number{{type=\"unassigned\",job=~\".*http\",es_cluster=\"{app_name}\"}}) by (es_cluster) > 0",
"for": "30m", "for": "30m",
"labels": { "labels": {
"alertTeam": app_team, "alertTeam": app_team,
@@ -150,7 +150,7 @@ def update_es_rule(api_instance, rule, namespace):
"description": "Number of unassigned shards for 15 min\n VALUE = `{{ $value }}`", "description": "Number of unassigned shards for 15 min\n VALUE = `{{ $value }}`",
"summary": "Elasticsearch unassigned shards (cluster `{{ $labels.es_cluster }}`)" "summary": "Elasticsearch unassigned shards (cluster `{{ $labels.es_cluster }}`)"
}, },
"expr": f"max(es_cluster_shards_number{{type=\"unassigned\",job=~\".*http\",es_cluster=\"{app_name}\"}}) by (es_cluster) \u003e 0", "expr": f"max(es_cluster_shards_number{{type=\"unassigned\",job=~\".*http\",es_cluster=\"{app_name}\"}}) by (es_cluster) > 0",
"for": "15m", "for": "15m",
"labels": { "labels": {
"alertTeam": app_team, "alertTeam": app_team,
@@ -164,7 +164,7 @@ def update_es_rule(api_instance, rule, namespace):
"description": "Number of pending tasks for 15 min. Cluster works slowly.\n VALUE = `{{ $value }}`", "description": "Number of pending tasks for 15 min. Cluster works slowly.\n VALUE = `{{ $value }}`",
"summary": "Elasticsearch pending tasks (cluster `{{ $labels.es_cluster }}`)" "summary": "Elasticsearch pending tasks (cluster `{{ $labels.es_cluster }}`)"
}, },
"expr": f"max(es_cluster_pending_tasks_number{{job=~\".*http\",es_cluster=\"{app_name}\"}}) by (es_cluster) \u003e 0", "expr": f"max(es_cluster_pending_tasks_number{{job=~\".*http\",es_cluster=\"{app_name}\"}}) by (es_cluster) > 0",
"for": "15m", "for": "15m",
"labels": { "labels": {
"alertTeam": app_team, "alertTeam": app_team,